diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
deleted file mode 100644
index bbed80ebe847..000000000000
--- a/.buildkite/generate_index.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import os
-
-template = """<!DOCTYPE html>
-<html>
-    <body>
-        <h1>Links for vLLM</h1>
-        <a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
-        <a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
-    </body>
-</html>
-"""
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--wheel", help="The wheel path.", required=True)
-args = parser.parse_args()
-
-filename = os.path.basename(args.wheel)
-
-with open("index.html", "w") as f:
- print(f"Generated index.html for {args.wheel}")
- # sync the abi tag with .buildkite/scripts/upload-wheels.sh
- if "x86_64" in filename:
- x86_wheel = filename
- arm_wheel = filename.replace("x86_64", "aarch64").replace(
- "manylinux1", "manylinux2014"
- )
- elif "aarch64" in filename:
- x86_wheel = filename.replace("aarch64", "x86_64").replace(
- "manylinux2014", "manylinux1"
- )
- arm_wheel = filename
- else:
- raise ValueError(f"Unsupported wheel: {filename}")
- # cloudfront requires escaping the '+' character
- f.write(
- template.format(
- x86_wheel=x86_wheel,
- x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
- arm_wheel=arm_wheel,
- arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
- )
- )
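For context: the comment in the deleted script above notes that CloudFront requires escaping the '+' character in wheel filenames, something the replacement index generator also has to handle. A standalone illustration (not part of the patch):

```python
# Standalone illustration (not part of this patch) of the '+' escaping that
# wheel indices need: the old script used str.replace, while the new generator
# can rely on urllib.parse.quote, which percent-encodes '+' but keeps '/'.
from urllib.parse import quote

wheel = "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux1_x86_64.whl"

print(wheel.replace("+", "%2B"))  # old approach: manual replacement
print(quote("../" + wheel))       # '+' becomes %2B, '/' and '.' stay as-is
```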
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 38c400ba1faf..fbfc923998f8 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -8,7 +8,7 @@ steps:
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -30,19 +30,6 @@ steps:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- - label: "Build wheel - CUDA 12.8"
- depends_on: ~
- id: build-wheel-cuda-12-8
- agents:
- queue: cpu_queue_postmerge
- commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- - "mkdir artifacts"
- - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- - "bash .buildkite/scripts/upload-wheels.sh"
- env:
- DOCKER_BUILDKIT: "1"
-
- label: "Build wheel - CUDA 12.9"
depends_on: ~
id: build-wheel-cuda-12-9
@@ -109,7 +96,6 @@ steps:
- label: "Annotate release workflow"
depends_on:
- create-multi-arch-manifest
- - build-wheel-cuda-12-8
id: annotate-release-workflow
agents:
queue: cpu_queue_postmerge
diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh
index 56bb5cedaa0a..df805e085080 100755
--- a/.buildkite/scripts/annotate-release.sh
+++ b/.buildkite/scripts/annotate-release.sh
@@ -23,8 +23,8 @@ To download the wheel (by version):
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu130/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`
To download and upload the image:
@@ -45,9 +45,10 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
-docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
-docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest rm vllm/vllm-openai:latest
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
-EOF
\ No newline at end of file
+EOF
diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py
new file mode 100644
index 000000000000..a61f08107647
--- /dev/null
+++ b/.buildkite/scripts/generate-nightly-index.py
@@ -0,0 +1,368 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+# do not complain about line length (for docstring)
+# ruff: noqa: E501
+
+import argparse
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass
+from pathlib import Path
+from typing import Any
+from urllib.parse import quote
+
+# Path.relative_to(..., walk_up=True) used below requires Python 3.12+
+if sys.version_info < (3, 12):
+    raise RuntimeError("This script requires Python 3.12 or higher.")
+
+INDEX_HTML_TEMPLATE = """
+<!DOCTYPE html>
+<html>
+<body>
+{items}
+</body>
+</html>
+"""
+
+
+@dataclass
+class WheelFileInfo:
+ package_name: str
+ version: str
+ build_tag: str | None
+ python_tag: str
+ abi_tag: str
+ platform_tag: str
+ variant: str | None
+ filename: str
+
+
+def parse_from_filename(file: str) -> WheelFileInfo:
+ """
+ Parse wheel file name to extract metadata.
+
+ The format of wheel names:
+ {package_name}-{version}(-{build_tag})?-{python_tag}-{abi_tag}-{platform_tag}.whl
+    The version may also carry a variant suffix such as '+cu129', '.cpu', or '.rocm' (or none at all).
+ Example:
+ vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl
+ vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0-cp38-abi3-manylinux2014_aarch64.whl
+ vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl
+ """
+ wheel_file_re = re.compile(
+        r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
+ )
+ match = wheel_file_re.match(file)
+ if not match:
+ raise ValueError(f"Invalid wheel file name: {file}")
+
+ package_name = match.group("package_name")
+ version = match.group("version")
+ build_tag = match.group("build_tag")
+ python_tag = match.group("python_tag")
+ abi_tag = match.group("abi_tag")
+ platform_tag = match.group("platform_tag")
+
+ # extract variant from version
+ variant = None
+ if "dev" in version:
+ ver_after_dev = version.split("dev")[-1]
+ if "." in ver_after_dev:
+ variant = ver_after_dev.split(".")[-1]
+ version = version.removesuffix("." + variant)
+ else:
+ if "+" in version:
+ version, variant = version.split("+")
+
+ return WheelFileInfo(
+ package_name=package_name,
+ version=version,
+ build_tag=build_tag,
+ python_tag=python_tag,
+ abi_tag=abi_tag,
+ platform_tag=platform_tag,
+ variant=variant,
+ filename=file,
+ )
+
+
+def generate_project_list(subdir_names: list[str]) -> str:
+ """
+ Generate project list HTML content linking to each project & variant sub-directory.
+ """
+ href_tags = []
+ for name in sorted(subdir_names):
+ name = name.strip("/").strip(".")
+        href_tags.append(f'    <a href="{name}/">{name}/</a><br/>')
+ return INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+
+
+def generate_package_index_and_metadata(
+ wheel_files: list[WheelFileInfo], wheel_base_dir: Path, index_base_dir: Path
+) -> tuple[str, str]:
+ """
+ Generate package index HTML content for a specific package, linking to actual wheel files.
+ """
+ href_tags = []
+ metadata = []
+ for file in sorted(wheel_files, key=lambda x: x.filename):
+ relative_path = (
+ wheel_base_dir.relative_to(index_base_dir, walk_up=True) / file.filename
+ )
+        href_tags.append(
+            f'    <a href="{quote(relative_path.as_posix())}">{file.filename}</a><br/>'
+        )
+ file_meta = asdict(file)
+ file_meta["path"] = relative_path.as_posix()
+ metadata.append(file_meta)
+ index_str = INDEX_HTML_TEMPLATE.format(items="\n".join(href_tags))
+ metadata_str = json.dumps(metadata, indent=2)
+ return index_str, metadata_str
+
+
+def generate_index_and_metadata(
+ whl_files: list[str],
+ wheel_base_dir: Path,
+ index_base_dir: Path,
+ default_variant: str | None = None,
+ alias_to_default: str | None = None,
+):
+ """
+ Generate index for all wheel files.
+
+ Args:
+ whl_files (list[str]): List of wheel files (must be directly under `wheel_base_dir`).
+ wheel_base_dir (Path): Base directory for wheel files.
+ index_base_dir (Path): Base directory to store index files.
+ default_variant (str | None): The default variant name, if any.
+ alias_to_default (str | None): Alias variant name for the default variant, if any.
+
+ First, parse all wheel files to extract metadata.
+ We need to collect all wheel files for each variant, and generate an index for it (in a sub-directory).
+ The index for the default variant (if any) is generated in the root index directory.
+
+ If `default_variant` is provided, all wheels must have variant suffixes, and the default variant index
+ is purely a copy of the corresponding variant index, with only the links adjusted.
+ Otherwise, all wheels without variant suffixes are treated as the default variant.
+
+    If `alias_to_default` is provided, an additional alias sub-directory is created; it has the same content
+    as the default variant index, with the links adjusted accordingly.
+
+ Index directory structure:
+ index_base_dir/ (hosted at wheels.vllm.ai/{nightly,$commit,$version}/)
+ index.html # project list, linking to "vllm/" and other packages, and all variant sub-directories
+ vllm/
+ index.html # package index, pointing to actual files in wheel_base_dir (relative path)
+ metadata.json # machine-readable metadata for all wheels in this package
+ cpu/ # cpu variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu129/ # cu129 is actually the alias to default variant
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ cu130/ # cu130 variant sub-directory
+ index.html
+ vllm/
+ index.html
+ metadata.json
+ ...
+
+ metadata.json stores a dump of all wheel files' metadata in a machine-readable format:
+ [
+ {
+ "package_name": "vllm",
+ "version": "0.10.2rc2",
+ "build_tag": null,
+ "python_tag": "cp38",
+ "abi_tag": "abi3",
+ "platform_tag": "manylinux2014_aarch64",
+ "variant": "cu129",
+ "filename": "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
+ "path": "../vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl" # to be concatenated with the directory URL
+ },
+ ...
+ ]
+ """
+
+ parsed_files = [parse_from_filename(f) for f in whl_files]
+
+ if not parsed_files:
+ print("No wheel files found, skipping index generation.")
+ return
+
+ # Group by variant
+ variant_to_files: dict[str, list[WheelFileInfo]] = {}
+ for file in parsed_files:
+ variant = file.variant or "default"
+ if variant not in variant_to_files:
+ variant_to_files[variant] = []
+ variant_to_files[variant].append(file)
+
+ print(f"Found variants: {list(variant_to_files.keys())}")
+
+ # sanity check for default variant
+ if default_variant:
+ if "default" in variant_to_files:
+ raise ValueError(
+ "All wheel files must have variant suffixes when `default_variant` is specified."
+ )
+ if default_variant not in variant_to_files:
+ raise ValueError(
+ f"Default variant '{default_variant}' not found among wheel files."
+ )
+
+ if alias_to_default:
+ if "default" not in variant_to_files:
+ # e.g. only some wheels are uploaded to S3 currently
+ print(
+ "[WARN] Alias to default variant specified, but no default variant found."
+ )
+ elif alias_to_default in variant_to_files:
+ raise ValueError(
+ f"Alias variant name '{alias_to_default}' already exists among wheel files."
+ )
+ else:
+ variant_to_files[alias_to_default] = variant_to_files["default"].copy()
+ print(f"Alias variant '{alias_to_default}' created for default variant.")
+
+ # Generate index for each variant
+ subdir_names = set()
+ for variant, files in variant_to_files.items():
+ if variant == "default":
+ variant_dir = index_base_dir
+ else:
+ variant_dir = index_base_dir / variant
+ subdir_names.add(variant)
+
+ variant_dir.mkdir(parents=True, exist_ok=True)
+
+ # gather all package names in this variant
+ packages = set(f.package_name for f in files)
+ if variant == "default":
+            # these packages should also appear in the "project list",
+            # which is generated after all variants are processed
+ subdir_names = subdir_names.union(packages)
+ else:
+ # generate project list for this variant directly
+ project_list_str = generate_project_list(sorted(packages))
+ with open(variant_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+ for package in packages:
+ # filter files belonging to this package only
+ package_files = [f for f in files if f.package_name == package]
+ package_dir = variant_dir / package
+ package_dir.mkdir(parents=True, exist_ok=True)
+ index_str, metadata_str = generate_package_index_and_metadata(
+ package_files, wheel_base_dir, package_dir
+ )
+ with open(package_dir / "index.html", "w") as f:
+ f.write(index_str)
+ with open(package_dir / "metadata.json", "w") as f:
+ f.write(metadata_str)
+
+ # Generate top-level project list index
+ project_list_str = generate_project_list(sorted(subdir_names))
+ with open(index_base_dir / "index.html", "w") as f:
+ f.write(project_list_str)
+
+
+if __name__ == "__main__":
+ """
+ Arguments:
+ --version : version string for the current build (e.g., commit hash)
+ --current-objects : path to JSON file containing current S3 objects listing in this version directory
+ --output-dir : directory to store generated index files
+ --alias-to-default : (optional) alias variant name for the default variant
+ """
+
+ parser = argparse.ArgumentParser(
+ description="Process nightly build wheel files to generate indices."
+ )
+ parser.add_argument(
+ "--version",
+ type=str,
+ required=True,
+ help="Version string for the current build (e.g., commit hash)",
+ )
+ parser.add_argument(
+ "--current-objects",
+ type=str,
+ required=True,
+ help="Path to JSON file containing current S3 objects listing in this version directory",
+ )
+ parser.add_argument(
+ "--output-dir",
+ type=str,
+ required=True,
+ help="Directory to store generated index files",
+ )
+ parser.add_argument(
+ "--alias-to-default",
+ type=str,
+ default=None,
+ help="Alias variant name for the default variant",
+ )
+
+ args = parser.parse_args()
+
+ version = args.version
+ if "/" in version or "\\" in version:
+ raise ValueError("Version string must not contain slashes.")
+ current_objects_path = Path(args.current_objects)
+ output_dir = Path(args.output_dir)
+ if not output_dir.exists():
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Read current objects JSON
+ with open(current_objects_path) as f:
+ current_objects: dict[str, list[dict[str, Any]]] = json.load(f)
+
+ # current_objects looks like from list_objects_v2 S3 API:
+    # current_objects looks like the output of the S3 list_objects_v2 API:
+ "Contents": [
+ {
+ "Key": "e2f56c309d2a28899c68975a7e104502d56deb8f/vllm-0.11.2.dev363+ge2f56c309-cp38-abi3-manylinux1_x86_64.whl",
+ "LastModified": "2025-11-28T14:00:32+00:00",
+ "ETag": "\"37a38339c7cdb61ca737021b968075df-52\"",
+ "ChecksumAlgorithm": [
+ "CRC64NVME"
+ ],
+ "ChecksumType": "FULL_OBJECT",
+ "Size": 435649349,
+ "StorageClass": "STANDARD"
+ },
+ ...
+ ]
+ """
+
+ # Extract wheel file keys
+ wheel_files = []
+ for item in current_objects.get("Contents", []):
+ key: str = item["Key"]
+ if key.endswith(".whl"):
+ wheel_files.append(key.split("/")[-1]) # only the filename is used
+
+ print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}")
+
+    # Generate index and metadata, assuming wheels and indices are stored as:
+    #   wheels:  s3://vllm-wheels/{version}/
+    #   indices: s3://vllm-wheels/<commit, nightly, or release version>/ (a sibling of the wheel directory)
+ wheel_base_dir = Path(output_dir).parent / version
+ index_base_dir = Path(output_dir)
+
+ generate_index_and_metadata(
+ whl_files=wheel_files,
+ wheel_base_dir=wheel_base_dir,
+ index_base_dir=index_base_dir,
+ default_variant=None,
+ alias_to_default=args.alias_to_default,
+ )
+ print(f"Successfully generated index and metadata in {output_dir}")
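For reference, here is a small standalone sketch (not part of the patch) that mirrors the filename-parsing rules documented in parse_from_filename() above, applied to the example names from its docstring:

```python
# Standalone sketch (not part of this patch) mirroring the wheel-name parsing
# rules of parse_from_filename(): split the wheel filename with the same regex,
# then peel the build variant off the version the same way the script does.
import re

WHEEL_RE = re.compile(
    r"^(?P<package_name>.+)-(?P<version>[^-]+?)(-(?P<build_tag>[^-]+))?"
    r"-(?P<python_tag>[^-]+)-(?P<abi_tag>[^-]+)-(?P<platform_tag>[^-]+)\.whl$"
)


def split_variant(version: str) -> tuple[str, str | None]:
    """Return (version, variant), e.g. '0.10.2rc2+cu129' -> ('0.10.2rc2', 'cu129')."""
    if "dev" in version:
        after_dev = version.split("dev")[-1]
        if "." in after_dev:
            variant = after_dev.split(".")[-1]
            return version.removesuffix("." + variant), variant
        return version, None
    if "+" in version:
        base, variant = version.split("+")
        return base, variant
    return version, None


for name in [
    "vllm-0.11.0-cp38-abi3-manylinux1_x86_64.whl",
    "vllm-0.10.2rc2+cu129-cp38-abi3-manylinux2014_aarch64.whl",
    "vllm-0.11.1rc8.dev14+gaa384b3c0.cu130-cp38-abi3-manylinux1_x86_64.whl",
]:
    match = WHEEL_RE.match(name)
    assert match is not None
    print(name, "->", split_variant(match.group("version")))
```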
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 0e5b21ddf25b..864eb470bb0a 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -59,7 +59,7 @@ while true; do
fi
done
-echo "--- Pulling container"
+echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
@@ -177,13 +177,13 @@ if [[ -z "$render_gid" ]]; then
exit 1
fi
-# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+# Check if the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
- # assign job count as the number of shards used
- commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
+ # assign job count as the number of shards used
+ commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
# assign shard-id for each shard
- commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+ commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
echo "Shard ${GPU} commands:$commands_gpu"
echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
docker run \
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
new file mode 100755
index 000000000000..b5f6b2494792
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# allow to bind to different cores
+CORE_RANGE=${CORE_RANGE:-0-16}
+OMP_CORE_RANGE=${OMP_CORE_RANGE:-0-16}
+
+export CMAKE_BUILD_PARALLEL_LEVEL=16
+
+# Setup cleanup
+remove_docker_container() {
+ set -e;
+ docker rm -f cpu-test || true;
+}
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Try building the docker image
+docker build --tag cpu-test --target vllm-test -f docker/Dockerfile.cpu .
+
+# Run the image
+docker run -itd --cpuset-cpus="$CORE_RANGE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test cpu-test
+
+function cpu_tests() {
+ set -e
+
+ docker exec cpu-test bash -c "
+ set -e
+ pip list"
+
+ # offline inference
+ docker exec cpu-test bash -c "
+ set -e
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+
+ # Run kernel tests
+ docker exec cpu-test bash -c "
+ set -e
+ pytest -x -v -s tests/kernels/test_onednn.py
+ pytest -x -v -s tests/kernels/attention/test_cpu_attn.py"
+
+ # basic online serving
+ docker exec cpu-test bash -c '
+ set -e
+ VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS vllm serve Qwen/Qwen3-0.6B --max-model-len 2048 &
+ server_pid=$!
+ timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+ vllm bench serve \
+ --backend vllm \
+ --dataset-name random \
+ --model Qwen/Qwen3-0.6B \
+ --num-prompts 20 \
+ --endpoint /v1/completions
+ kill -s SIGTERM $server_pid &'
+}
+
+# All CPU tests are expected to finish in less than 40 minutes.
+export -f cpu_tests
+timeout 2h bash -c cpu_tests
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
index 39ea18017308..3728f73fa2a3 100755
--- a/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh
@@ -25,20 +25,22 @@ function cpu_tests() {
# offline inference
podman exec -it "$container_id" bash -c "
+ export TORCH_COMPILE_DISABLE=1
set -xve
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log
# Run basic model test
podman exec -it "$container_id" bash -c "
+ export TORCH_COMPILE_DISABLE=1
set -evx
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
- pip install sentence-transformers datamodel_code_generator
+ pip install sentence-transformers datamodel_code_generator tblib
# Note: disable Bart until supports V1
# pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
- pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
+ pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-openai-community/gpt2]
+ pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-facebook/opt-125m]
+ pytest -v -s tests/models/language/generation/test_common.py::test_models[False-False-5-32-google/gemma-1.1-2b-it]
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
# TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
# pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
index 7479c43977d7..438fe522c870 100644
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -21,8 +21,8 @@ trap remove_docker_container EXIT
remove_docker_container
# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --progress plain --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
@@ -73,12 +73,11 @@ function cpu_tests() {
pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"
- # Note: disable it until supports V1
- # Run AWQ test
- # docker exec cpu-test-"$NUMA_NODE" bash -c "
- # set -e
- # pytest -x -s -v \
- # tests/quantization/test_ipex_quant.py"
+ # Run AWQ/GPTQ test
+ docker exec cpu-test-"$NUMA_NODE" bash -c "
+ set -e
+ pytest -x -s -v \
+ tests/quantization/test_cpu_wna16.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 27ed67c4517e..4d163399cfc6 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -35,7 +35,7 @@ docker run \
echo $ZE_AFFINITY_MASK
pip install tblib==3.1.0
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
- python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+ python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -cc.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
@@ -46,6 +46,6 @@ docker run \
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
- pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+ pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py
pytest -v -s v1/test_serial_utils.py
'
diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
index 5302f524a0ae..8106f50f18f6 100644
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -17,7 +17,17 @@ wait_for_server() {
}
MODEL="deepseek-ai/DeepSeek-V2-lite"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+ # ROCm platform
+ BACKENDS=("allgather_reducescatter")
+ # Disable MOE padding for ROCm since it is causing eplb to fail
+  # Disable MoE padding on ROCm since it causes EPLB to fail
+else
+ # Non-ROCm platform (CUDA/other)
+ BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
diff --git a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
similarity index 64%
rename from .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
rename to .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
index a5135299297e..6a1bef275d04 100644
--- a/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh
+++ b/.buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh
@@ -1,10 +1,12 @@
#!/usr/bin/env bash
set -euxo pipefail
-# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] [DATA_PARALLEL_SIZE] [TENSOR_PARALLEL_SIZE]
THRESHOLD=${1:-0.8}
NUM_Q=${2:-1319}
PORT=${3:-8020}
+DATA_PARALLEL_SIZE=${4:-2}
+TENSOR_PARALLEL_SIZE=${5:-2}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
@@ -17,7 +19,16 @@ wait_for_server() {
}
MODEL="QWen/Qwen3-30B-A3B-FP8"
-BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+# Set BACKENDS based on platform
+if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
+ # ROCm platform
+ BACKENDS=("allgather_reducescatter")
+  # Disable MoE padding on ROCm since it causes EPLB to fail
+ export VLLM_ROCM_MOE_PADDING=0
+else
+ # Non-ROCm platform (CUDA/other)
+ BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+fi
cleanup() {
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
@@ -36,8 +47,10 @@ for BACK in "${BACKENDS[@]}"; do
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
- --tensor-parallel-size 2 \
- --data-parallel-size 2 \
+ --enable-eplb \
+ --eplb-config '{"window_size":10, "step_interval":100, "num_redundant_experts":0, "log_balancedness":true}' \
+ --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
+ --data-parallel-size ${DATA_PARALLEL_SIZE} \
--enable-expert-parallel \
--trust-remote-code \
--max-model-len 2048 \
diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh
index 945c5e48c009..05accb9cf16d 100644
--- a/.buildkite/scripts/upload-wheels.sh
+++ b/.buildkite/scripts/upload-wheels.sh
@@ -2,6 +2,28 @@
set -ex
+# ======== part 0: setup ========
+
+BUCKET="vllm-wheels"
+INDICES_OUTPUT_DIR="indices"
+DEFAULT_VARIANT_ALIAS="cu129" # align with VLLM_MAIN_CUDA_VERSION in vllm/envs.py
+PYTHON=${PYTHON_PROG:=python3} # try to read from env var, otherwise use python3
+SUBPATH=$BUILDKITE_COMMIT
+S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
+
+# detect if python3.12+ is available (generate-nightly-index.py needs it, e.g. for Path.relative_to(walk_up=True))
+has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)")
+if [[ "$has_new_python" -eq 0 ]]; then
+ # use new python from docker
+ docker pull python:3-slim
+ PYTHON="docker run --rm -v $(pwd):/app -w /app python:3-slim python3"
+fi
+
+echo "Using python interpreter: $PYTHON"
+echo "Python version: $($PYTHON --version)"
+
+# ========= part 1: collect, rename & upload the wheel ==========
+
# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)
@@ -10,74 +32,69 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1
fi
-
-# Get the single wheel file
wheel="${wheel_files[0]}"
-# Detect architecture and rename 'linux' to appropriate manylinux version
-arch=$(uname -m)
-if [[ $arch == "x86_64" ]]; then
- manylinux_version="manylinux1"
-elif [[ $arch == "aarch64" ]]; then
- manylinux_version="manylinux2014"
-else
- echo "Warning: Unknown architecture $arch, using manylinux1 as default"
- manylinux_version="manylinux1"
-fi
+# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31
+# refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels
+manylinux_version="manylinux_2_31"
# Rename 'linux' to the appropriate manylinux version in the wheel filename
+if [[ "$wheel" != *"linux"* ]]; then
+ echo "Error: Wheel filename does not contain 'linux': $wheel"
+ exit 1
+fi
new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
+echo "Renamed wheel to: $wheel"
# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
-echo "Version: $version"
-
-normal_wheel="$wheel" # Save the original wheel filename
-
-# If the version contains "dev", rename it to v1.0.0.dev for consistency
-if [[ $version == *dev* ]]; then
- suffix="${version##*.}"
- if [[ $suffix == cu* ]]; then
- new_version="1.0.0.dev+${suffix}"
- else
- new_version="1.0.0.dev"
- fi
- new_wheel="${wheel/$version/$new_version}"
- # use cp to keep both files in the artifacts directory
- cp -- "$wheel" "$new_wheel"
- wheel="$new_wheel"
- version="$new_version"
-fi
+echo "Version in wheel: $version"
+pure_version="${version%%+*}"
+echo "Pure version (without variant): $pure_version"
-# Upload the wheel to S3
-python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+# copy the wheel to the per-commit prefix
+aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
-# generate index for this commit
-aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+# ========= part 2: generate and upload indices ==========
+# generate indices for all existing wheels in the commit directory
+# This script might be run multiple times if multiple variants are being built,
+# so we need to minimize the window for "TOCTOU" issues,
+# i.e., one process generating indices while another uploads a new wheel.
+# Therefore, avoid time-consuming operations below this point.
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
- aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+# list all wheels in the commit directory
+echo "Existing wheels on S3:"
+aws s3 ls "$S3_COMMIT_PREFIX"
+obj_json="objects.json"
+aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$SUBPATH/" --delimiter / --output json > "$obj_json"
+mkdir -p "$INDICES_OUTPUT_DIR"
+
+# call the script to generate indices for all existing wheels.
+# These indices use relative paths, so they keep working as long as they sit next to the wheel directory in S3,
+# i.e., the wheels are always in s3://vllm-wheels/<commit>/
+# and the indices can be placed in /<commit>/, /nightly/, or /<version>/
+if [[ ! -z "$DEFAULT_VARIANT_ALIAS" ]]; then
+ alias_arg="--alias-to-default $DEFAULT_VARIANT_ALIAS"
else
- echo "Skipping index files for non-cu129 wheels"
+ alias_arg=""
fi
-# generate index for nightly
-aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
-aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+$PYTHON .buildkite/scripts/generate-nightly-index.py --version "$SUBPATH" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" $alias_arg
-if [[ $normal_wheel == *"cu129"* ]]; then
- # only upload index.html for cu129 wheels (default wheels) as it
- # is available on both x86 and arm64
- aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
-else
- echo "Skipping index files for non-cu129 wheels"
+# copy indices to /<commit>/ unconditionally
+echo "Uploading indices to $S3_COMMIT_PREFIX"
+aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
+
+# copy to /nightly/ only if it is on the main branch and not a PR
+if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; then
+ echo "Uploading indices to overwrite /nightly/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/"
fi
-aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
-aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
+# copy to /<version>/ only if it does not have "dev" in the version
+if [[ "$version" != *"dev"* ]]; then
+ echo "Uploading indices to overwrite /$pure_version/"
+ aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/"
+fi
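As a rough illustration of how the published layout could be consumed (this client-side snippet is not part of the patch; it assumes the wheels.vllm.ai hosting and metadata.json format described in generate-nightly-index.py), a consumer might resolve a concrete wheel URL like this:

```python
# Hypothetical consumer-side sketch (not part of this patch): resolve a wheel URL
# from a variant's per-package metadata.json, assuming indices are hosted under
# https://wheels.vllm.ai/<commit, nightly, or version>/ as described above.
import json
import urllib.request
from urllib.parse import urljoin


def find_wheel_url(index_root: str, variant: str | None, platform_tag: str) -> str | None:
    """index_root e.g. 'https://wheels.vllm.ai/nightly/'; variant e.g. 'cu130', or None for the default."""
    package_dir = urljoin(index_root, (f"{variant}/" if variant else "") + "vllm/")
    with urllib.request.urlopen(urljoin(package_dir, "metadata.json")) as resp:
        entries = json.load(resp)
    for entry in entries:
        if entry["platform_tag"] == platform_tag:
            # "path" is relative to the directory holding metadata.json
            return urljoin(package_dir, entry["path"])
    return None


print(find_wheel_url("https://wheels.vllm.ai/nightly/", "cu130", "manylinux1_x86_64"))
```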
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 5fd048c2ad0c..687b6b08507c 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -61,8 +61,8 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
- timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+ timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -72,14 +72,18 @@ steps:
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
- tests/transformers_utils
+ - tests/config
no_gpu: true
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
- pytest -v -s transformers_utils
+ - pytest -v -s config
- label: Python-only Installation Test # 10min
timeout_in_minutes: 20
@@ -187,7 +191,7 @@ steps:
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- - tests/compile/test_basic_correctness
+ - tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
@@ -215,7 +219,7 @@ steps:
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- - pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -226,6 +230,27 @@ steps:
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
+- label: Distributed Tests (8 GPUs) # 4min
+ timeout_in_minutes: 10
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_8
+ # grade: Blocking
+ gpu: h100
+ num_gpus: 8
+ working_dir: "/vllm-workspace/tests"
+ source_file_dependencies:
+ - examples/offline_inference/torchrun_dp_example.py
+ - vllm/config/parallel.py
+ - vllm/distributed/
+ - vllm/v1/engine/llm_engine.py
+ - vllm/v1/executor/uniproc_executor.py
+ - vllm/v1/worker/gpu_worker.py
+ commands:
+ # https://github.com/NVIDIA/nccl/issues/1838
+ #- export NCCL_CUMEM_HOST_ENABLE=0
+ # test with torchrun tp=2 and dp=4 with ep
+ - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
- label: EPLB Algorithm Test # 5min
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
@@ -238,11 +263,11 @@ steps:
commands:
- pytest -v -s distributed/test_eplb_algo.py
-- label: EPLB Execution Test # 5min
+- label: EPLB Execution Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_4
# grade: Blocking
- timeout_in_minutes: 15
+ timeout_in_minutes: 20
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -250,6 +275,7 @@ steps:
- tests/distributed/test_eplb_execute.py
commands:
- pytest -v -s distributed/test_eplb_execute.py
+ - pytest -v -s distributed/test_eplb_spec_decode.py
- label: Metrics, Tracing Test # 12min
timeout_in_minutes: 20
@@ -273,7 +299,7 @@ steps:
- label: Regression Test # 7min
timeout_in_minutes: 20
- mirror_hardwares: [amdexperimental, amdproduction]
+ mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
agent_pool: mi325_1
grade: Blocking
source_file_dependencies:
@@ -284,23 +310,20 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 25min
- timeout_in_minutes: 40
+- label: Engine Test # 9min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
- #grade: Blocking
+ # grade: Blocking
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- # OOM in the CI unless we run this separately
- - pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
@@ -337,6 +360,7 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
+ - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
- pytest -v -s -m 'not cpu_test' v1/core
- pytest -v -s v1/executor
- pytest -v -s v1/kv_offload
@@ -344,7 +368,7 @@ steps:
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/spec_decode
- - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_lmcache_integration.py
+ - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
- pytest -v -s -m 'not cpu_test' v1/metrics
- pytest -v -s v1/test_oracle.py
- pytest -v -s v1/test_request.py
@@ -353,6 +377,29 @@ steps:
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+# TODO: Add the "V1 Test attention (MI300)" test group
+
+- label: V1 Test attention (H100) # 10min
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_1
+ # grade: Blocking
+ timeout_in_minutes: 30
+ gpu: h100
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - pytest -v -s v1/attention
+
+- label: V1 Test attention (B200) # 10min
+ timeout_in_minutes: 30
+ gpu: b200
+ source_file_dependencies:
+ - vllm/v1/attention
+ - tests/v1/attention
+ commands:
+ - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
- label: V1 Test others (CPU) # 5 mins
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
@@ -456,17 +503,12 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_pass_manager.py
- - pytest -v -s compile/test_fusion.py
- - pytest -v -s compile/test_fusion_attn.py
- - pytest -v -s compile/test_functionalization.py
- - pytest -v -s compile/test_silu_mul_quant_fusion.py
- # - pytest -v -s compile/test_sequence_parallelism.py
- # - pytest -v -s compile/test_async_tp.py
- - pytest -v -s compile/test_fusion_all_reduce.py
- - pytest -v -s compile/test_decorator.py
- - pytest -v -s compile/test_noop_elimination.py
- - pytest -v -s compile/test_aot_compile.py
+ # Run unit tests defined directly under compile/,
+ # not including subdirectories, which are usually heavier
+ # tests covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -478,11 +520,14 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_basic_correctness.py
- - pytest -v -s compile/piecewise/
+ # Run smoke tests under fullgraph directory, except test_full_graph.py
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
-- label: PyTorch Fullgraph Test # 22min
- timeout_in_minutes: 35
+- label: PyTorch Fullgraph Test # 27min
+ timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -491,8 +536,23 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_full_graph.py
- - pytest -v -s compile/test_fusions_e2e.py
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # Limit to no custom ops to reduce running time
+ # Wrap with quotes to escape yaml and avoid starting -k string with a -
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+
+- label: Cudagraph test
+ timeout_in_minutes: 20
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_1
+ source_file_dependencies:
+ - tests/v1/cudagraph
+ - vllm/v1/cudagraph_dispatcher.py
+ - vllm/config/compilation.py
+ - vllm/compilation
+ commands:
+ - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
+ - pytest -v -s v1/cudagraph/test_cudagraph_mode.py
- label: Kernels Core Operation Test # 48min
timeout_in_minutes: 75
@@ -544,6 +604,8 @@ steps:
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
- vllm/distributed/device_communicators/
+ - vllm/envs.py
+ - vllm/config
commands:
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
@@ -562,10 +624,13 @@ steps:
- label: Model Executor Test # 23min
timeout_in_minutes: 35
+ torch_nightly: true
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
source_file_dependencies:
+ - vllm/engine/arg_utils.py
+ - vllm/config/model.py
- vllm/model_executor
- tests/model_executor
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -639,7 +704,7 @@ steps:
- vllm/model_executor/models/whisper.py
commands: # LMEval
# Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442
- - pytest -s entrypoints/openai/correctness/ --ignore entrypoints/openai/correctness/test_transcription_api_correctness.py
+ - pytest -s entrypoints/openai/correctness/
- label: OpenAI-Compatible Tool Use # 23 min
timeout_in_minutes: 35
@@ -688,6 +753,7 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
+ - vllm/transformers_utils/
- tests/models/test_initialization.py
commands:
# Only when vLLM model source is modified - test initialization of a large
@@ -861,9 +927,10 @@ steps:
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 10min
+ timeout_in_minutes: 70
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
- timeout_in_minutes: 15
+ # grade: Blocking
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies:
- vllm/multimodal/
@@ -934,16 +1001,17 @@ steps:
- label: Transformers Nightly Models Test
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
+ # grade: Blocking
working_dir: "/vllm-workspace/"
optional: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- - pytest -v -s tests/models/test_initialization.py
+ - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
- pytest -v -s tests/models/test_transformers.py
- - pytest -v -s tests/models/multimodal/processing/
- - pytest -v -s tests/models/multimodal/test_mapping.py
+ # - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
- python3 examples/offline_inference/basic/chat.py
- - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -961,11 +1029,16 @@ steps:
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/attention/backends/mla/cutlass_mla.py
+ - vllm/v1/attention/backends/mla/flashinfer_mla.py
+ - vllm/platforms/cuda.py
+ - vllm/attention/selector.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
+ - pytest -v -s tests/kernels/attention/test_attention_selector.py
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
@@ -983,7 +1056,7 @@ steps:
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
@@ -1001,13 +1074,40 @@ steps:
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+ # Wrap with quotes to escape yaml
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/distributed/test_full_graph.py::test_fp8_kv_scale_compile
+
+- label: Blackwell Fusion E2E Tests # 30 min
+ timeout_in_minutes: 40
+ working_dir: "/vllm-workspace/"
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ source_file_dependencies:
+ - csrc/quantization/fp4/
+ - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+ - vllm/v1/attention/backends/flashinfer.py
+ - vllm/compilation/
+ # can affect pattern matching
+ - vllm/model_executor/layers/layernorm.py
+ - vllm/model_executor/layers/activation.py
+ - vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
+ commands:
+ - nvidia-smi
+ # Run all e2e fusion tests
- pytest -v -s tests/compile/test_fusions_e2e.py
-- label: Blackwell GPT-OSS Eval
+- label: ROCm GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
- gpu: b200
+ agent_pool: mi325_1
+ mirror_hardwares: [amdexperimental, amdproduction]
optional: true # run on nightlies
source_file_dependencies:
- tests/evals/gpt_oss
@@ -1016,7 +1116,7 @@ steps:
- vllm/v1/attention/backends/flashinfer.py
commands:
- uv pip install --system 'gpt-oss[eval]==0.0.5'
- - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+ - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
- label: Blackwell Quantized MoE Test
timeout_in_minutes: 60
@@ -1106,7 +1206,7 @@ steps:
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- - tests/compile/test_basic_correctness.py
+ - tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
@@ -1119,7 +1219,7 @@ steps:
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- - pytest -v -s ./compile/test_basic_correctness.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1219,7 +1319,10 @@ steps:
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_llm_with_multi_loras.py
- pytest -v -s -x lora/test_olmoe_tp.py
- - pytest -v -s -x lora/test_gptoss_tp.py
+
+  # Disabled for now because the MXFP4 backend doesn't support
+  # LoRA on non-CUDA platforms yet
+ #- pytest -v -s -x lora/test_gptoss_tp.py
- label: Weight Loading Multiple GPU Test # 33min
@@ -1234,7 +1337,7 @@ steps:
- vllm/
- tests/weight_loading
commands:
- - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt
- label: Weight Loading Multiple GPU Test - Large Models # optional
mirror_hardwares: [amdexperimental]
@@ -1242,17 +1345,17 @@ steps:
# grade: Blocking
working_dir: "/vllm-workspace/tests"
num_gpus: 2
- gpu: a100
optional: true
source_file_dependencies:
- vllm/
- tests/weight_loading
commands:
- - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+ - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt
- label: NixlConnector PD accuracy tests (Distributed) # 30min
mirror_hardwares: [amdexperimental]
agent_pool: mi325_4
+ # grade: Blocking
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_gpus: 4
@@ -1267,6 +1370,9 @@ steps:
##### A100 test #####
- label: Distributed Tests (A100) # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
gpu: a100
optional: true
num_gpus: 4
@@ -1281,6 +1387,9 @@ steps:
- pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
gpu: a100
optional: true
num_gpus: 4
@@ -1292,19 +1401,41 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
+##### H100 test #####
+- label: LM Eval Large Models (H100) # optional
+ mirror_hardwares: [amdexperimental, amdproduction]
+ agent_pool: mi325_4
+ # grade: Blocking
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+ source_file_dependencies:
+ - csrc/
+ - vllm/model_executor/layers/quantization
+ commands:
+ - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
+ - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
+
##### H200 test #####
- label: Distributed Tests (H200) # optional
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_2
+ # grade: Blocking
gpu: h200
optional: true
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/test_async_tp.py
- - pytest -v -s tests/compile/test_sequence_parallelism.py
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
- - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - pytest -v -s tests/compile/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
+ - pytest -v -s tests/v1/distributed/test_dbo.py
##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1315,6 +1446,7 @@ steps:
commands:
- pytest -v -s tests/distributed/test_context_parallel.py
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
+ - pytest -v -s tests/v1/distributed/test_dbo.py
##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
@@ -1330,3 +1462,27 @@ steps:
- .buildkite/scripts/run-prime-rl-test.sh
commands:
- bash .buildkite/scripts/run-prime-rl-test.sh
+
+- label: DeepSeek V2-Lite Accuracy
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy
+ mirror_hardwares: [amdexperimental]
+ agent_pool: mi325_4
+ # grade: Blocking
+ timeout_in_minutes: 60
+ gpu: h100
+ optional: true
+ num_gpus: 4
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index be1b79ddc432..9f2107fb1e5a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -57,14 +57,15 @@ steps:
- pytest -v -s -m 'not cpu_test' multimodal
- pytest -v -s utils_
-- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 4 mins
- timeout_in_minutes: 10
+- label: Async Engine, Inputs, Utils, Worker, Config Test (CPU) # 15min
+ timeout_in_minutes: 20
source_file_dependencies:
- vllm/
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/standalone_tests/lazy_imports.py
+ - tests/tokenizers_
- tests/transformers_utils
- tests/config
no_gpu: true
@@ -73,6 +74,7 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s -m 'cpu_test' multimodal
+ - pytest -v -s tokenizers_
- pytest -v -s transformers_utils
- pytest -v -s config
@@ -167,7 +169,7 @@ steps:
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/distributed/test_events
- - tests/compile/test_basic_correctness
+ - tests/compile/fullgraph/test_basic_correctness.py
- examples/offline_inference/rlhf.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
@@ -192,12 +194,13 @@ steps:
# test with internal dp
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
- pytest -v -s distributed/test_utils.py
- - pytest -v -s compile/test_basic_correctness.py
+ - pytest -v -s compile/fullgraph/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s distributed/test_events.py
- pytest -v -s distributed/test_symm_mem_allreduce.py
@@ -275,21 +278,18 @@ steps:
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
-- label: Engine Test # 25min
- timeout_in_minutes: 40
+- label: Engine Test # 9min
+ timeout_in_minutes: 15
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/engine
- - tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
- tests/test_vllm_port
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- # OOM in the CI unless we run this separately
- - pytest -v -s tokenization
- label: V1 Test e2e + engine # 30min
timeout_in_minutes: 45
@@ -346,6 +346,18 @@ steps:
commands:
- pytest -v -s v1/attention
+- label: Batch Invariance Tests (H100) # 10min
+ timeout_in_minutes: 25
+ gpu: h100
+ source_file_dependencies:
+ - vllm/
+ - tests/v1/determinism/
+ commands:
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pip install pytest-timeout pytest-forked
+ - pytest -v -s v1/determinism/test_batch_invariance.py
+ - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py
+
- label: V1 Test attention (B200) # 10min
timeout_in_minutes: 30
gpu: b200
@@ -445,17 +457,12 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_config.py
- - pytest -v -s compile/test_pass_manager.py
- - pytest -v -s compile/test_fusion.py
- - pytest -v -s compile/test_fusion_attn.py
- - pytest -v -s compile/test_functionalization.py
- - pytest -v -s compile/test_silu_mul_quant_fusion.py
- - pytest -v -s compile/test_fusion_all_reduce.py
- - pytest -v -s compile/test_decorator.py
- - pytest -v -s compile/test_noop_elimination.py
- - pytest -v -s compile/test_aot_compile.py
- - pytest -v -s compile/test_qk_norm_rope_fusion.py
+ # Run unit tests defined directly under compile/,
+ # excluding subdirectories, whose heavier tests are
+ # covered elsewhere.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
@@ -465,9 +472,11 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_basic_correctness.py
- - pytest -v -s compile/test_multimodal_compile.py
- - pytest -v -s compile/piecewise/
+ # Run smoke tests under fullgraph directory, except test_full_graph.py
+ # as it is a heavy test that is covered in other steps.
+ # Use `find` to launch multiple instances of pytest so that
+ # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
+ - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
@@ -477,10 +486,11 @@ steps:
- vllm/
- tests/compile
commands:
- - pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+ # fp8 kv scales not supported on sm89, tested on Blackwell instead
+ - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
# Limit to no custom ops to reduce running time
# Wrap with quotes to escape yaml and avoid starting -k string with a -
- - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
+ - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
- label: Cudagraph test
timeout_in_minutes: 20
@@ -552,6 +562,25 @@ steps:
commands:
- pytest -v -s kernels/mamba
+- label: Kernels DeepGEMM Test (H100)
+ timeout_in_minutes: 45
+ gpu: h100
+ num_gpus: 1
+ source_file_dependencies:
+ - tools/install_deepgemm.sh
+ - vllm/utils/deep_gemm.py
+ - vllm/model_executor/layers/fused_moe
+ - vllm/model_executor/layers/quantization
+ - tests/kernels/quantization/test_block_fp8.py
+ - tests/kernels/moe/test_deepgemm.py
+ - tests/kernels/moe/test_batched_deepgemm.py
+ - tests/kernels/attention/test_deepgemm_attention.py
+ commands:
+ - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm
+ - pytest -v -s kernels/moe/test_deepgemm.py
+ - pytest -v -s kernels/moe/test_batched_deepgemm.py
+ - pytest -v -s kernels/attention/test_deepgemm_attention.py
+
- label: Model Executor Test # 23min
timeout_in_minutes: 35
torch_nightly: true
@@ -602,6 +631,7 @@ steps:
# we can only upgrade after this is resolved
# TODO(jerryzh168): resolve the above comment
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+ - uv pip install --system conch-triton-kernels
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
- label: LM Eval Small Models # 53min
@@ -662,6 +692,7 @@ steps:
torch_nightly: true
source_file_dependencies:
- vllm/model_executor/models/
+ - vllm/transformers_utils/
- tests/models/test_initialization.py
commands:
# Only when vLLM model source is modified - test initialization of a large
@@ -788,14 +819,24 @@ steps:
commands:
- pytest -v -s models/language/pooling_mteb_test
-- label: Multi-Modal Processor Test # 44min
+- label: Multi-Modal Processor Test (CPU)
+ timeout_in_minutes: 60
+ source_file_dependencies:
+ - vllm/
+ - tests/models/multimodal
+ no_gpu: true
+ commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py
+
+- label: Multi-Modal Processor Test
timeout_in_minutes: 60
source_file_dependencies:
- vllm/
- tests/models/multimodal
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- - pytest -v -s models/multimodal/processing
+ - pytest -v -s models/multimodal/processing/test_tensor_schema.py
- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80
@@ -872,14 +913,15 @@ steps:
- label: Transformers Nightly Models Test
working_dir: "/vllm-workspace/"
optional: true
+ soft_fail: true
commands:
- pip install --upgrade git+https://github.com/huggingface/transformers
- - pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
+ - pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- # - pytest -v -s tests/models/multimodal/processing/
- - pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
+ - pytest -v -s tests/models/multimodal/processing/
+ - pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- # - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+ - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
@@ -923,8 +965,9 @@ steps:
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
- pytest -v -s tests/kernels/moe/test_flashinfer.py
+ - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
-- label: Blackwell Fusion Tests # 30 min
+- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
@@ -932,20 +975,29 @@ steps:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
+ - vllm/v1/worker/
+ - vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
+ - tests/compile/test_fusion_attn.py
+ - tests/compile/test_silu_mul_quant_fusion.py
+ - tests/compile/distributed/test_fusion_all_reduce.py
+ - tests/compile/distributed/test_fusions_e2e.py
+ - tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
+ - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+ # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+ - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Blackwell Fusion E2E Tests # 30 min
timeout_in_minutes: 40
@@ -962,14 +1014,11 @@ steps:
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- - tests/compile/test_fusions_e2e.py
- - tests/compile/test_full_graph.py
+ - tests/compile/distributed/test_fusions_e2e.py
commands:
- nvidia-smi
# Run all e2e fusion tests
- - pytest -v -s tests/compile/test_fusions_e2e.py
- # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- - pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
+ - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
@@ -1067,7 +1116,7 @@ steps:
- vllm/worker/worker_base.py
- vllm/v1/engine/
- vllm/v1/worker/
- - tests/compile/test_basic_correctness.py
+ - tests/compile/fullgraph/test_basic_correctness.py
- tests/compile/test_wrapper.py
- tests/distributed/
- tests/entrypoints/llm/test_collective_rpc.py
@@ -1079,10 +1128,11 @@ steps:
# https://github.com/NVIDIA/nccl/issues/1838
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- - pytest -v -s ./compile/test_basic_correctness.py
+ - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
@@ -1262,10 +1312,11 @@ steps:
working_dir: "/vllm-workspace/"
num_gpus: 2
commands:
- - pytest -v -s tests/compile/test_async_tp.py
- - pytest -v -s tests/compile/test_sequence_parallelism.py
- - pytest -v -s tests/compile/test_fusion_all_reduce.py
- - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
+ - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+ - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+ - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+ - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
- pytest -v -s tests/v1/distributed/test_dbo.py
@@ -1302,11 +1353,20 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
-- label: Qwen3-30B-A3B-FP8-block Accuracy
+- label: Qwen3-30B-A3B-FP8-block Accuracy (H100)
timeout_in_minutes: 60
gpu: h100
optional: true
num_gpus: 4
working_dir: "/vllm-workspace"
commands:
- - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020
+
+- label: Qwen3-30B-A3B-FP8-block Accuracy (B200)
+ timeout_in_minutes: 60
+ gpu: b200
+ optional: true
+ num_gpus: 2
+ working_dir: "/vllm-workspace"
+ commands:
+ - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
\ No newline at end of file
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index f26c782bccf2..ecb10d1a450f 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -3,12 +3,13 @@
# This lists cover the "core" components of vLLM that require careful review
/vllm/attention @LucasWilkinson
-/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @njhill @22quinn
/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/model_loader @22quinn
+/vllm/model_executor/layers/batch_invariant.py @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
@@ -20,27 +21,30 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
-/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
-/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
+/vllm/config @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
# vLLM V1
/vllm/v1/attention @LucasWilkinson
/vllm/v1/attention/backends/mla @pavanimajety
/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
/vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/vllm/v1/sample @22quinn @houseroad @njhill
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/kv_cache_interface.py @heheda12345
/vllm/v1/offloading @ApostaC
+# Model runner V2
+/vllm/v1/worker/gpu @WoosukKwon
+
# Test ownership
-/.buildkite/lm-eval-harness @mgoin @simon-mo
+/.buildkite/lm-eval-harness @mgoin
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @aarnphm @NickLucche
/tests/evals @mgoin
/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
@@ -49,15 +53,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
/tests/v1/kv_connector @ApostaC
/tests/v1/offloading @ApostaC
+/tests/v1/determinism @yewentao256
-# Transformers backend
+# Transformers modeling backend
/vllm/model_executor/models/transformers @hmellor
/tests/models/test_transformers.py @hmellor
@@ -144,6 +149,7 @@ mkdocs.yaml @hmellor
/examples/*/pooling/ @noooop
/tests/models/*/pooling* @noooop
/tests/entrypoints/pooling @noooop
+/vllm/entrypoints/pooling @aarnphm @chaunceyjiang @noooop
/vllm/config/pooler.py @noooop
/vllm/pooling_params.py @noooop
/vllm/model_executor/layers/pooler.py @noooop
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index c3e132a536a4..861290ea43c8 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -13,7 +13,7 @@ jobs:
steps:
- name: Checkout repository
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
diff --git a/.github/workflows/issue_autolabel.yml b/.github/workflows/issue_autolabel.yml
index 7d565ef9f2e4..629966b95933 100644
--- a/.github/workflows/issue_autolabel.yml
+++ b/.github/workflows/issue_autolabel.yml
@@ -105,6 +105,31 @@ jobs:
}
],
},
+ cpu: {
+ // Keyword search - matches whole words only (with word boundaries)
+ keywords: [
+ {
+ term: "CPU Backend",
+ searchIn: "title"
+ },
+ {
+ term: "x86",
+ searchIn: "title"
+ },
+ {
+ term: "ARM",
+ searchIn: "title"
+ },
+ {
+ term: "Apple Silicon",
+ searchIn: "title"
+ },
+ {
+ term: "IBM Z",
+ searchIn: "title"
+ },
+ ],
+ },
// Add more label configurations here as needed
// example: {
// keywords: [...],
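
The `cpu` block above relies on the workflow's whole-word keyword search against the issue title. As a rough illustration only (the real matching lives in the JavaScript of issue_autolabel.yml, and case handling here is an assumption), the configured behavior can be sketched in Python with a word-boundary regex:

import re

# Hypothetical sketch of "matches whole words only (with word boundaries)"
# for the cpu label's title keywords listed above.
CPU_TITLE_KEYWORDS = ["CPU Backend", "x86", "ARM", "Apple Silicon", "IBM Z"]

def title_matches_cpu(title: str) -> bool:
    # \b keeps "ARM" from matching inside words such as "alarm" or "disarm".
    return any(
        re.search(rf"\b{re.escape(term)}\b", title, flags=re.IGNORECASE)
        for term in CPU_TITLE_KEYWORDS
    )

print(title_matches_cpu("[Bug] x86 CPU build fails"))  # True
print(title_matches_cpu("Disarm the alarm handler"))   # False
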
diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml
new file mode 100644
index 000000000000..3a12c4b3a830
--- /dev/null
+++ b/.github/workflows/macos-smoke-test.yml
@@ -0,0 +1,80 @@
+name: macOS Apple Silicon Smoke Test
+
+on:
+ push:
+ branches:
+ - main
+ workflow_dispatch: # Manual trigger
+
+jobs:
+ macos-m1-smoke-test:
+ runs-on: macos-latest
+ timeout-minutes: 30
+
+ steps:
+ - uses: actions/checkout@v6
+
+ - uses: astral-sh/setup-uv@v7
+ with:
+ enable-cache: true
+ cache-dependency-glob: |
+ requirements/**/*.txt
+ pyproject.toml
+ python-version: '3.12'
+
+ - name: Create virtual environment
+ run: |
+ uv venv
+ echo "$GITHUB_WORKSPACE/.venv/bin" >> "$GITHUB_PATH"
+
+ - name: Install dependencies and build vLLM
+ run: |
+ uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
+ uv pip install -e .
+ env:
+ CMAKE_BUILD_PARALLEL_LEVEL: 4
+
+ - name: Verify installation
+ run: |
+ python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
+
+ - name: Smoke test vllm serve
+ run: |
+ # Start server in background
+ vllm serve Qwen/Qwen3-0.6B \
+ --max-model-len=2K \
+ --load-format=dummy \
+ --hf-overrides '{"num_hidden_layers": 2}' \
+ --enforce-eager \
+ --port 8000 &
+
+ SERVER_PID=$!
+
+ # Wait for server to start
+ for i in {1..30}; do
+ if curl -s http://localhost:8000/health > /dev/null; then
+ echo "Server started successfully"
+ break
+ fi
+ if [ "$i" -eq 30 ]; then
+ echo "Server failed to start"
+ kill "$SERVER_PID"
+ exit 1
+ fi
+ sleep 2
+ done
+
+ # Test health endpoint
+ curl -f http://localhost:8000/health
+
+ # Test completion
+ curl -f http://localhost:8000/v1/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "Qwen/Qwen3-0.6B",
+ "prompt": "Hello",
+ "max_tokens": 5
+ }'
+
+ # Cleanup
+ kill "$SERVER_PID"
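
The same checks the workflow performs with curl can be reproduced locally against a running `vllm serve` instance. A minimal sketch using the third-party `requests` package (an assumption; the workflow itself only uses curl), with the same port, model, and payload as above:

import requests

BASE_URL = "http://localhost:8000"  # port used by the workflow above

# Health probe, equivalent to the curl polling loop in the workflow.
requests.get(f"{BASE_URL}/health", timeout=5).raise_for_status()

# Minimal completion request, mirroring the curl payload above.
resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={"model": "Qwen/Qwen3-0.6B", "prompt": "Hello", "max_tokens": 5},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
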
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index e21d13b8161f..d5e70f30ef63 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -16,7 +16,7 @@ jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"
diff --git a/.gitignore b/.gitignore
index 50070d7898fe..7cda86478664 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,9 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
+# OpenAI triton kernels copied from source
+vllm/third_party/triton_kernels/*
+
# triton jit
.triton
diff --git a/.markdownlint.yaml b/.markdownlint.yaml
index cd9df57cd980..937487f47364 100644
--- a/.markdownlint.yaml
+++ b/.markdownlint.yaml
@@ -3,10 +3,9 @@ MD007:
MD013: false
MD024:
siblings_only: true
+MD031:
+ list_items: false
MD033: false
-MD045: false
MD046: false
-MD051: false
MD052: false
-MD053: false
MD059: false
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcc44be87e55..e09972fe7199 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,7 +136,7 @@ elseif(HIP_FOUND)
# ROCm 5.X and 6.X
if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND
- NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM})
+ Torch_VERSION VERSION_LESS ${TORCH_SUPPORTED_VERSION_ROCM})
message(WARNING "Pytorch version >= ${TORCH_SUPPORTED_VERSION_ROCM} "
"expected for ROCm build, saw ${Torch_VERSION} instead.")
endif()
@@ -307,7 +307,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
- set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")
+ set(CUTLASS_REVISION "v4.2.1")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -354,8 +354,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+
+ # marlin arches for fp16 output
+ cuda_archs_loose_intersection(MARLIN_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+ # marlin arches for bf16 output (we need 9.0 for bf16 atomicAdd PTX)
+ cuda_archs_loose_intersection(MARLIN_BF16_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+ # marlin arches for fp8 input
+ # - sm80 doesn't support fp8 computation
+ # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SASS instruction
+ # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+ cuda_archs_loose_intersection(MARLIN_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
+
if (MARLIN_ARCHS)
#
@@ -365,16 +374,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
+ list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+ set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
- message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
+ message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+ message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
- if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
+ if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
+ ${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE marlin_generation_result
OUTPUT_VARIABLE marlin_generation_result
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
@@ -387,15 +398,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
else()
- set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
- CACHE STRING "Last run Marlin generate script hash" FORCE)
+ set(MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ CACHE STRING "Last run Marlin generate script hash and arch" FORCE)
message(STATUS "Marlin generation completed successfully.")
endif()
else()
message(STATUS "Marlin generation script has not changed, skipping generation.")
endif()
- file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
+ file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_float16.cu")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")
@@ -403,12 +414,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
-
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
+ file(GLOB MARLIN_TEMPLATE_BF16_KERNEL_SRC "csrc/quantization/gptq_marlin/sm80_kernel_*_bfloat16.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_BF16_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_BF16_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_BF16_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_BF16_KERNEL_SRC})
+
+ if (MARLIN_FP8_ARCHS)
+ file(GLOB MARLIN_TEMPLATE_FP8_KERNEL_SRC "csrc/quantization/gptq_marlin/sm89_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_TEMPLATE_FP8_KERNEL_SRC}"
+ CUDA_ARCHS "${MARLIN_FP8_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_TEMPLATE_FP8_KERNEL_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_FP8_KERNEL_SRC})
+ endif()
+
set(MARLIN_SRCS
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
+ "csrc/quantization/gptq_marlin/marlin_int4_fp8_preprocess.cu"
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
"csrc/quantization/gptq_marlin/awq_marlin_repack.cu")
set_gencode_flags_for_srcs(
@@ -512,9 +545,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
# require CUDA 12.8 or later
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS
@@ -604,12 +637,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
- "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
+ "csrc/quantization/fp4/nvfp4_experts_quant.cu"
+ "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu"
+ "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
+ list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
@@ -619,9 +655,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# FP4 Archs and flags
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
- cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
- cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
@@ -695,7 +731,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
@@ -741,9 +777,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
else()
- cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
endif()
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
@@ -861,7 +897,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
# Hadacore kernels
- cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+ cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
if(HADACORE_ARCHS)
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
set_gencode_flags_for_srcs(
@@ -938,8 +974,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
- # 9.0 for latest bf16 atomicAdd PTX
- cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+ # moe marlin arches
+ # note that we always set `use_atomic_add=False` for moe marlin now,
+ # so we don't need 9.0 for bf16 atomicAdd PTX
+ cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0+PTX" "${CUDA_ARCHS}")
+ # moe marlin arches for fp8 input
+ # - sm80 doesn't support fp8 computation
+ # - sm90 and sm100 don't support QMMA.16832.F32.E4M3.E4M3 SASS instruction
+ # so we only enable fp8 computation for SM89 (e.g. RTX 40x0) and 12.0 (e.g. RTX 50x0)
+ cuda_archs_loose_intersection(MARLIN_MOE_FP8_ARCHS "8.9;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
#
@@ -949,16 +992,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(MOE_MARLIN_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/moe/marlin_moe_wna16/generate_kernels.py)
file(MD5 ${MOE_MARLIN_GEN_SCRIPT} MOE_MARLIN_GEN_SCRIPT_HASH)
+ list(JOIN CUDA_ARCHS "," CUDA_ARCHS_STR)
+ set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH "${MOE_MARLIN_GEN_SCRIPT_HASH}(ARCH:${CUDA_ARCHS_STR})")
- message(STATUS "Marlin MOE generation script hash: ${MOE_MARLIN_GEN_SCRIPT_HASH}")
- message(STATUS "Last run Marlin MOE generate script hash: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}")
+ message(STATUS "Marlin MOE generation script hash with arch: ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
+ message(STATUS "Last run Marlin MOE generate script hash with arch: $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}")
- if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH}
- OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
+ if (NOT DEFINED CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
+ OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH})
execute_process(
COMMAND ${CMAKE_COMMAND} -E env
PYTHONPATH=$PYTHONPATH
- ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
+ ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT} ${CUDA_ARCHS_STR}
RESULT_VARIABLE moe_marlin_generation_result
OUTPUT_VARIABLE moe_marlin_generation_output
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log
@@ -971,7 +1016,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/moe_marlin_generation.log")
else()
- set(MOE_MARLIN_GEN_SCRIPT_HASH ${MOE_MARLIN_GEN_SCRIPT_HASH}
+ set(MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH ${MOE_MARLIN_GEN_SCRIPT_HASH_AND_ARCH}
CACHE STRING "Last run Marlin MOE generate script hash" FORCE)
message(STATUS "Marlin MOE generation completed successfully.")
endif()
@@ -979,16 +1024,28 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Marlin MOE generation script has not changed, skipping generation.")
endif()
- file(GLOB MOE_WNAA16_MARLIN_SRC "csrc/moe/marlin_moe_wna16/*.cu")
+ file(GLOB MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/sm80_kernel_*.cu")
+ list(APPEND MARLIN_MOE_SRC "csrc/moe/marlin_moe_wna16/ops.cu")
set_gencode_flags_for_srcs(
- SRCS "${MOE_WNAA16_MARLIN_SRC}"
+ SRCS "${MARLIN_MOE_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
- set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
+ set_source_files_properties(${MARLIN_MOE_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
-
- list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_SRC})
+
+ if (MARLIN_MOE_FP8_ARCHS)
+ file(GLOB MARLIN_MOE_FP8_SRC "csrc/moe/marlin_moe_wna16/sm89_kernel_*.cu")
+ set_gencode_flags_for_srcs(
+ SRCS "${MARLIN_MOE_FP8_SRC}"
+ CUDA_ARCHS "${MARLIN_MOE_FP8_ARCHS}")
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+ set_source_files_properties(${MARLIN_MOE_FP8_SRC}
+ PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
+ endif()
+ list(APPEND VLLM_MOE_EXT_SRC ${MARLIN_MOE_FP8_SRC})
+ endif()
message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}")
else()
@@ -1030,6 +1087,11 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
WITH_SOABI)
endif()
+# For CUDA and HIP builds also build the triton_kernels external package.
+if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
+ include(cmake/external_projects/triton_kernels.cmake)
+endif()
+
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
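
The regeneration guard changed above now keys the cached value on both the generator script's MD5 and the CUDA arch list, so editing the script or reconfiguring with a different architecture set both force the Marlin kernels to be regenerated. A rough Python sketch of that check (names are illustrative, not part of the actual build):

def marlin_needs_regeneration(script_md5: str, cuda_archs: list[str],
                              cached_key: str | None) -> bool:
    # Mirrors the "${HASH}(ARCH:${CUDA_ARCHS_STR})" key composed in the CMake above.
    key = f"{script_md5}(ARCH:{','.join(cuda_archs)})"
    return cached_key is None or cached_key != key

print(marlin_needs_regeneration("abc123", ["8.0+PTX", "8.9"], None))        # True
print(marlin_needs_regeneration("abc123", ["8.0+PTX", "8.9"],
                                "abc123(ARCH:8.0+PTX,8.9)"))                # False
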
diff --git a/README.md b/README.md
index 033e1035d891..abbb63158f16 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
*Latest News* 🔥
+- [2025/11] We hosted [vLLM Bangkok Meetup](https://luma.com/v0f647nv). We explored vLLM and LMCache inference and low-resource language adaptation with speakers from Embedded LLM, AMD, and Red Hat. Please find the meetup slides [here](https://drive.google.com/drive/folders/1H0DS57F8HQ5q3kSOSoRmucPJWL3E0A_X?usp=sharing).
- [2025/11] We hosted [the first vLLM Europe Meetup in Zurich](https://luma.com/0gls27kb) focused on quantization, distributed inference, and reinforcement learning at scale with speakers from Mistral, IBM, and Red Hat. Please find the meetup slides [here](https://docs.google.com/presentation/d/1UC9PTLCHYXQpOmJDSFg6Sljra3iVXzc09DeEI7dnxMc/edit?usp=sharing) and recording [here](https://www.youtube.com/watch?v=6m6ZE6yVEDI)
- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4021fede7215..d69d74ca61f5 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -620,7 +620,7 @@ def get_tokenizer(
kwargs["use_fast"] = False
if tokenizer_mode == "mistral":
try:
- from vllm.transformers_utils.tokenizer import MistralTokenizer
+ from vllm.tokenizers import MistralTokenizer
except ImportError as e:
raise ImportError(
"MistralTokenizer requires vllm package.\n"
diff --git a/benchmarks/benchmark_batch_invariance.py b/benchmarks/benchmark_batch_invariance.py
new file mode 100755
index 000000000000..b5c16c42de46
--- /dev/null
+++ b/benchmarks/benchmark_batch_invariance.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Benchmark to measure the performance overhead of VLLM_BATCH_INVARIANT mode.
+
+This benchmark runs the same workload twice:
+1. With VLLM_BATCH_INVARIANT=0 (baseline)
+2. With VLLM_BATCH_INVARIANT=1 (batch invariant mode)
+
+And reports the timing and throughput metrics for comparison.
+
+Environment variables:
+ VLLM_BENCH_MODEL: Model to benchmark (default: "Qwen/Qwen3-1.7B")
+ VLLM_BENCH_TP_SIZE: Tensor parallel size (default: 1, use 8 for deepseek)
+ VLLM_BENCH_BATCH_SIZE: Max batch size (default: 128)
+ VLLM_BENCH_NUM_TRIALS: Number of trials to run (default: 5)
+ VLLM_BENCH_MIN_PROMPT: Min prompt length in words (default: 1024)
+ VLLM_BENCH_MAX_PROMPT: Max prompt length in words (default: 2048)
+ VLLM_BENCH_MAX_TOKENS: Max tokens to generate (default: 128)
+ VLLM_BENCH_TEMPERATURE: Temperature for sampling (default: 0.0)
+ VLLM_BENCH_GPU_MEMORY_UTILIZATION: GPU memory utilization (default: 0.4)
+ VLLM_BENCH_MAX_MODEL_LEN: Max model length (default: 5120)
+ VLLM_BENCH_BACKEND: Attention backend (default: FLASH_ATTN)
+
+Example usage:
+ # Benchmark qwen3 (default)
+ python benchmarks/benchmark_batch_invariance.py
+
+ # Benchmark deepseek with 8 GPUs
+ VLLM_BENCH_MODEL="deepseek-ai/DeepSeek-V3" VLLM_BENCH_TP_SIZE=8 \\
+ python benchmarks/benchmark_batch_invariance.py
+
+ # Quick test with fewer trials
+ VLLM_BENCH_NUM_TRIALS=2 VLLM_BENCH_BATCH_SIZE=32 \\
+ python benchmarks/benchmark_batch_invariance.py
+"""
+
+import contextlib
+import os
+import random
+import time
+
+from vllm import LLM, SamplingParams
+from vllm.platforms import current_platform
+
+
+def _random_prompt(min_words: int = 1024, max_words: int = 1024 * 2) -> str:
+ """Generate a random prompt for benchmarking."""
+ prompt_templates = [
+ "Question: What is the capital of France?\nAnswer: The capital of France is",
+ "Q: How does photosynthesis work?\nA: Photosynthesis is the process by which",
+ "User: Can you explain quantum mechanics?\nAssistant: Quantum mechanics is",
+ "Once upon a time in a distant galaxy, there lived",
+ "The old man walked slowly down the street, remembering",
+ "In the year 2157, humanity finally discovered",
+ "To implement a binary search tree in Python, first we need to",
+ "The algorithm works by iterating through the array and",
+ "Here's how to optimize database queries using indexing:",
+ "The Renaissance was a period in European history that",
+ "Climate change is caused by several factors including",
+ "The human brain contains approximately 86 billion neurons which",
+ "I've been thinking about getting a new laptop because",
+ "Yesterday I went to the store and bought",
+ "My favorite thing about summer is definitely",
+ ]
+
+ base_prompt = random.choice(prompt_templates)
+
+ if max_words < min_words:
+ max_words = min_words
+ target_words = random.randint(min_words, max_words)
+
+ if target_words > 50:
+ padding_text = (
+ " This is an interesting topic that deserves more explanation. "
+ * (target_words // 50)
+ )
+ base_prompt = base_prompt + padding_text
+
+ return base_prompt
+
+
+def run_benchmark_with_batch_invariant(
+ model: str,
+ tp_size: int,
+ max_batch_size: int,
+ num_trials: int,
+ min_prompt: int,
+ max_prompt: int,
+ max_tokens: int,
+ temperature: float,
+ gpu_mem_util: float,
+ max_model_len: int,
+ backend: str,
+ batch_invariant: bool,
+ seed: int = 12345,
+) -> dict:
+ """
+ Run the benchmark with the specified configuration.
+
+ Returns a dict with timing and throughput metrics.
+ """
+ random.seed(seed)
+
+ # Set environment variables
+ os.environ["VLLM_ATTENTION_BACKEND"] = backend
+ if batch_invariant:
+ os.environ["VLLM_BATCH_INVARIANT"] = "1"
+ else:
+ os.environ["VLLM_BATCH_INVARIANT"] = "0"
+
+ print(f"\n{'=' * 80}")
+ print(f"BENCHMARK: VLLM_BATCH_INVARIANT={int(batch_invariant)}")
+ print(f" Model: {model}")
+ print(f" TP Size: {tp_size}")
+ print(f" Backend: {backend}")
+ print(f" Max Batch Size: {max_batch_size}")
+ print(f" Trials: {num_trials}")
+ print(f" Max Tokens: {max_tokens}")
+ print(f"{'=' * 80}\n")
+
+ sampling = SamplingParams(
+ temperature=temperature,
+ top_p=0.95,
+ max_tokens=max_tokens,
+ seed=20240919,
+ )
+
+ needle_prompt = "There once was a "
+
+ llm = None
+ try:
+ # Create LLM engine
+ start_init = time.perf_counter()
+ llm = LLM(
+ model=model,
+ max_num_seqs=max_batch_size,
+ gpu_memory_utilization=gpu_mem_util,
+ max_model_len=max_model_len,
+ dtype="bfloat16",
+ tensor_parallel_size=tp_size,
+ enable_prefix_caching=False,
+ )
+ init_time = time.perf_counter() - start_init
+ print(f"Engine initialization time: {init_time:.2f}s\n")
+
+ # Generate baseline
+ print("Generating baseline (warmup)...")
+ baseline_out = llm.generate([needle_prompt], sampling)
+ assert len(baseline_out) == 1
+ baseline_text = baseline_out[0].outputs[0].text
+ print(f"Baseline output: '{baseline_text[:50]}...'\n")
+
+ # Run trials and measure timing
+ trial_times: list[float] = []
+ total_tokens = 0
+ total_prompts = 0
+
+ for trial in range(num_trials):
+ # Create a batch
+ prompts: list[str] = []
+ batch_size = random.randint(max_batch_size // 2, max_batch_size)
+ needle_pos = random.randint(0, batch_size - 1)
+ for i in range(batch_size):
+ if i == needle_pos:
+ prompts.append(needle_prompt)
+ else:
+ prompts.append(_random_prompt(min_prompt, max_prompt))
+
+ # Measure time for this trial
+ start_time = time.perf_counter()
+ outputs = llm.generate(prompts, sampling)
+ trial_time = time.perf_counter() - start_time
+
+ trial_times.append(trial_time)
+ total_prompts += len(prompts)
+
+ # Count tokens
+ for output in outputs:
+ if output.outputs:
+ total_tokens += len(output.outputs[0].token_ids)
+
+ print(
+ f"Trial {trial + 1}/{num_trials}: "
+ f"batch_size={batch_size}, "
+ f"time={trial_time:.2f}s"
+ )
+
+ # Verify needle output still matches
+ needle_output = outputs[needle_pos]
+ assert needle_output.prompt == needle_prompt
+
+ # Compute statistics
+ avg_time = sum(trial_times) / len(trial_times)
+ min_time = min(trial_times)
+ max_time = max(trial_times)
+ throughput = total_tokens / sum(trial_times)
+ prompts_per_sec = total_prompts / sum(trial_times)
+
+ print(f"\n{'=' * 80}")
+ print("RESULTS:")
+ print(f" Average time per trial: {avg_time:.2f}s")
+ print(f" Min time: {min_time:.2f}s")
+ print(f" Max time: {max_time:.2f}s")
+ print(f" Total tokens generated: {total_tokens}")
+ print(f" Total prompts processed: {total_prompts}")
+ print(f" Throughput: {throughput:.2f} tokens/s")
+ print(f" Prompts/s: {prompts_per_sec:.2f}")
+ print(f"{'=' * 80}\n")
+
+ return {
+ "init_time": init_time,
+ "avg_time": avg_time,
+ "min_time": min_time,
+ "max_time": max_time,
+ "total_tokens": total_tokens,
+ "total_prompts": total_prompts,
+ "throughput": throughput,
+ "prompts_per_sec": prompts_per_sec,
+ "trial_times": trial_times,
+ }
+
+ finally:
+ # Cleanup
+ if llm is not None:
+ with contextlib.suppress(Exception):
+ llm.shutdown()
+
+
+def main():
+ # Check platform support
+ if not (current_platform.is_cuda() and current_platform.has_device_capability(90)):
+ print("ERROR: Requires CUDA and >= Hopper (SM90)")
+ print(f"Current platform: {current_platform.device_type}")
+ if current_platform.is_cuda():
+ print(f"Device capability: {current_platform.get_device_capability()}")
+ return 1
+
+ # Read configuration from environment
+ model = os.getenv("VLLM_BENCH_MODEL", "Qwen/Qwen3-1.7B")
+ tp_size = int(os.getenv("VLLM_BENCH_TP_SIZE", "1"))
+ max_batch_size = int(os.getenv("VLLM_BENCH_BATCH_SIZE", "128"))
+ num_trials = int(os.getenv("VLLM_BENCH_NUM_TRIALS", "5"))
+ min_prompt = int(os.getenv("VLLM_BENCH_MIN_PROMPT", "1024"))
+ max_prompt = int(os.getenv("VLLM_BENCH_MAX_PROMPT", "2048"))
+ max_tokens = int(os.getenv("VLLM_BENCH_MAX_TOKENS", "128"))
+ temperature = float(os.getenv("VLLM_BENCH_TEMPERATURE", "0.0"))
+ gpu_mem_util = float(os.getenv("VLLM_BENCH_GPU_MEMORY_UTILIZATION", "0.4"))
+ max_model_len = int(os.getenv("VLLM_BENCH_MAX_MODEL_LEN", "5120"))
+ backend = os.getenv("VLLM_BENCH_BACKEND", "FLASH_ATTN")
+
+ print("\n" + "=" * 80)
+ print("VLLM BATCH INVARIANCE BENCHMARK")
+ print("=" * 80)
+ print("\nConfiguration:")
+ print(f" Model: {model}")
+ print(f" Tensor Parallel Size: {tp_size}")
+ print(f" Attention Backend: {backend}")
+ print(f" Max Batch Size: {max_batch_size}")
+ print(f" Number of Trials: {num_trials}")
+ print(f" Prompt Length Range: {min_prompt}-{max_prompt} words")
+ print(f" Max Tokens to Generate: {max_tokens}")
+ print(f" Temperature: {temperature}")
+ print(f" GPU Memory Utilization: {gpu_mem_util}")
+ print(f" Max Model Length: {max_model_len}")
+ print("=" * 80)
+
+ # Run benchmark WITHOUT batch invariance (baseline)
+ print("\n" + "=" * 80)
+ print("PHASE 1: Running WITHOUT batch invariance (baseline)")
+ print("=" * 80)
+ baseline_results = run_benchmark_with_batch_invariant(
+ model=model,
+ tp_size=tp_size,
+ max_batch_size=max_batch_size,
+ num_trials=num_trials,
+ min_prompt=min_prompt,
+ max_prompt=max_prompt,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ gpu_mem_util=gpu_mem_util,
+ max_model_len=max_model_len,
+ backend=backend,
+ batch_invariant=False,
+ )
+
+ # Run benchmark WITH batch invariance
+ print("\n" + "=" * 80)
+ print("PHASE 2: Running WITH batch invariance")
+ print("=" * 80)
+ batch_inv_results = run_benchmark_with_batch_invariant(
+ model=model,
+ tp_size=tp_size,
+ max_batch_size=max_batch_size,
+ num_trials=num_trials,
+ min_prompt=min_prompt,
+ max_prompt=max_prompt,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ gpu_mem_util=gpu_mem_util,
+ max_model_len=max_model_len,
+ backend=backend,
+ batch_invariant=True,
+ )
+
+ # Compare results
+ print("\n" + "=" * 80)
+ print("COMPARISON: Batch Invariance vs Baseline")
+ print("=" * 80)
+
+ init_overhead_pct = (
+ (batch_inv_results["init_time"] - baseline_results["init_time"])
+ / baseline_results["init_time"]
+ * 100
+ )
+ time_overhead_pct = (
+ (batch_inv_results["avg_time"] - baseline_results["avg_time"])
+ / baseline_results["avg_time"]
+ * 100
+ )
+ throughput_change_pct = (
+ (batch_inv_results["throughput"] - baseline_results["throughput"])
+ / baseline_results["throughput"]
+ * 100
+ )
+
+ print("\nInitialization Time:")
+ print(f" Baseline: {baseline_results['init_time']:.2f}s")
+ print(f" Batch Invariant: {batch_inv_results['init_time']:.2f}s")
+ print(f" Overhead: {init_overhead_pct:+.2f}%")
+
+ print("\nAverage Trial Time:")
+ print(f" Baseline: {baseline_results['avg_time']:.2f}s")
+ print(f" Batch Invariant: {batch_inv_results['avg_time']:.2f}s")
+ print(f" Overhead: {time_overhead_pct:+.2f}%")
+
+ print("\nThroughput (tokens/s):")
+ print(f" Baseline: {baseline_results['throughput']:.2f}")
+ print(f" Batch Invariant: {batch_inv_results['throughput']:.2f}")
+ print(f" Change: {throughput_change_pct:+.2f}%")
+
+ print("\nPrompts/s:")
+ print(f" Baseline: {baseline_results['prompts_per_sec']:.2f}")
+ print(f" Batch Invariant: {batch_inv_results['prompts_per_sec']:.2f}")
+
+ print("\n" + "=" * 80)
+ print("SUMMARY")
+ print("=" * 80)
+ if time_overhead_pct > 0:
+ print(
+ f"Batch invariance mode adds approximately {time_overhead_pct:.1f}% "
+ "overhead"
+ )
+ else:
+ print(
+ f"Batch invariance mode is approximately {-time_overhead_pct:.1f}% "
+ "faster (unexpected!)"
+ )
+
+ if abs(throughput_change_pct) < 1.0:
+ print("Throughput difference is negligible (< 1%)")
+ elif throughput_change_pct < 0:
+ print(
+ f"Throughput decreased by {-throughput_change_pct:.1f}% "
+ "with batch invariance"
+ )
+ else:
+ print(
+ f"Throughput increased by {throughput_change_pct:.1f}% "
+ "with batch invariance (unexpected!)"
+ )
+
+ print("=" * 80 + "\n")
+
+ return 0
+
+
+if __name__ == "__main__":
+ exit(main())
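
For reference, the comparison printed at the end of the benchmark reduces to relative deltas between the two phases; a worked example of that arithmetic with made-up numbers:

# Illustrative numbers only; the benchmark fills these in from its two runs.
baseline_avg_s, batch_invariant_avg_s = 10.0, 11.5
overhead_pct = (batch_invariant_avg_s - baseline_avg_s) / baseline_avg_s * 100
print(f"Overhead: {overhead_pct:+.2f}%")  # -> Overhead: +15.00%
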
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 146c268a6b7f..28fc383a318d 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -69,7 +69,7 @@ def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
# Remove the special tokens.
return random.choices(
- [v for k, v in vocab.items() if k not in all_special_ids],
+ [v for v in vocab.values() if v not in all_special_ids],
k=length,
)
diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
index 904f80534914..d072c03c440b 100644
--- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
+++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py
@@ -5,11 +5,12 @@
import asyncio
import logging
import os
+import time
+import uuid
+from urllib.parse import urlparse
import aiohttp
from quart import Quart, Response, make_response, request
-from rate_limiter import RateLimiter
-from request_queue import RequestQueue
# Configure logging
logging.basicConfig(level=logging.INFO)
@@ -24,26 +25,8 @@ def parse_args():
parser.add_argument(
"--timeout",
type=float,
- default=300,
- help="Timeout for backend service requests in seconds (default: 300)",
- )
- parser.add_argument(
- "--max-concurrent",
- type=int,
- default=100,
- help="Maximum concurrent requests to backend services (default: 100)",
- )
- parser.add_argument(
- "--queue-size",
- type=int,
- default=500,
- help="Maximum number of requests in the queue (default: 500)",
- )
- parser.add_argument(
- "--rate-limit",
- type=int,
- default=40,
- help="Maximum requests per second (default: 40)",
+ default=6 * 60 * 60,
+ help="Timeout for backend service requests in seconds (default: 21600)",
)
parser.add_argument(
"--port",
@@ -54,14 +37,32 @@ def parse_args():
parser.add_argument(
"--prefill-url",
type=str,
- default="http://localhost:8100/v1/completions",
- help="Prefill service endpoint URL",
+ default="http://localhost:8100",
+ help="Prefill service base URL (protocol + host[:port])",
)
parser.add_argument(
"--decode-url",
type=str,
- default="http://localhost:8200/v1/completions",
- help="Decode service endpoint URL",
+ default="http://localhost:8200",
+ help="Decode service base URL (protocol + host[:port])",
+ )
+ parser.add_argument(
+ "--kv-host",
+ type=str,
+ default="localhost",
+ help="Hostname or IP used by KV transfer (default: localhost)",
+ )
+ parser.add_argument(
+ "--prefill-kv-port",
+ type=int,
+ default=14579,
+ help="Prefill KV port (default: 14579)",
+ )
+ parser.add_argument(
+ "--decode-kv-port",
+ type=int,
+ default=14580,
+ help="Decode KV port (default: 14580)",
)
return parser.parse_args()
@@ -73,70 +74,129 @@ def main():
# Initialize configuration using command line parameters
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
- MAX_CONCURRENT_REQUESTS = args.max_concurrent
- REQUEST_QUEUE_SIZE = args.queue_size
- RATE_LIMIT = args.rate_limit
PREFILL_SERVICE_URL = args.prefill_url
DECODE_SERVICE_URL = args.decode_url
PORT = args.port
- app = Quart(__name__)
+ PREFILL_KV_ADDR = f"{args.kv_host}:{args.prefill_kv_port}"
+ DECODE_KV_ADDR = f"{args.kv_host}:{args.decode_kv_port}"
- # Initialize the rate limiter and request queue
- rate_limiter = RateLimiter(RATE_LIMIT)
- request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
+ logger.info(
+ "Proxy resolved KV addresses -> prefill: %s, decode: %s",
+ PREFILL_KV_ADDR,
+ DECODE_KV_ADDR,
+ )
+
+ app = Quart(__name__)
- # Attach the configuration object to the application instance
+ # Attach the configuration object to the application instance so helper
+ # coroutines can read the resolved backend URLs and timeouts without using
+ # globals.
app.config.update(
{
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
- "rate_limiter": rate_limiter,
- "request_queue": request_queue,
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
+ "PREFILL_KV_ADDR": PREFILL_KV_ADDR,
+ "DECODE_KV_ADDR": DECODE_KV_ADDR,
}
)
- # Start queue processing on app startup
- @app.before_serving
- async def startup():
- """Start request processing task when app starts serving"""
- asyncio.create_task(request_queue.process())
-
- async def forward_request(url, data):
- """Forward request to backend service with rate limiting and error handling"""
- headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-
- # Use rate limiter as context manager
- async with (
- rate_limiter,
- aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
- ):
- try:
- async with session.post(
- url=url, json=data, headers=headers
- ) as response:
- if response.status == 200:
- # Stream response chunks
- async for chunk_bytes in response.content.iter_chunked(1024):
- yield chunk_bytes
- else:
- # Handle backend service errors
- error_text = await response.text()
- logger.error(
- "Backend service error: %s - %s",
- response.status,
- error_text,
- )
- yield b'{"error": "Backend service error"}'
- except aiohttp.ClientError as e:
- # Handle connection errors
- logger.error("Connection error to %s: %s", url, str(e))
- yield b'{"error": "Service unavailable"}'
- except asyncio.TimeoutError:
- # Handle timeout errors
- logger.error("Timeout connecting to %s", url)
- yield b'{"error": "Service timeout"}'
+ def _normalize_base_url(url: str) -> str:
+ """Remove any trailing slash so path joins behave predictably."""
+ return url.rstrip("/")
+
+ def _get_host_port(url: str) -> str:
+ """Return the hostname:port portion for logging and KV headers."""
+ parsed = urlparse(url)
+ host = parsed.hostname or "localhost"
+ port = parsed.port
+ if port is None:
+ port = 80 if parsed.scheme == "http" else 443
+ return f"{host}:{port}"
+
+ PREFILL_BASE = _normalize_base_url(PREFILL_SERVICE_URL)
+ DECODE_BASE = _normalize_base_url(DECODE_SERVICE_URL)
+ KV_TARGET = _get_host_port(DECODE_SERVICE_URL)
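+    # e.g. with the default --decode-url, KV_TARGET == "localhost:8200"
+    # (80 or 443 is assumed when the URL has no explicit port).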
+
+ def _build_headers(request_id: str) -> dict[str, str]:
+ """Construct the headers expected by vLLM's P2P disagg connector."""
+ headers: dict[str, str] = {"X-Request-Id": request_id, "X-KV-Target": KV_TARGET}
+ api_key = os.environ.get("OPENAI_API_KEY")
+ if api_key:
+ headers["Authorization"] = f"Bearer {api_key}"
+ return headers
+
+ async def _run_prefill(
+ request_path: str,
+ payload: dict,
+ headers: dict[str, str],
+ request_id: str,
+ ):
+ url = f"{PREFILL_BASE}{request_path}"
+ start_ts = time.perf_counter()
+ logger.info("[prefill] start request_id=%s url=%s", request_id, url)
+ try:
+ async with (
+ aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+ session.post(url=url, json=payload, headers=headers) as resp,
+ ):
+ if resp.status != 200:
+ error_text = await resp.text()
+ raise RuntimeError(
+ f"Prefill backend error {resp.status}: {error_text}"
+ )
+ await resp.read()
+ logger.info(
+ "[prefill] done request_id=%s status=%s elapsed=%.2fs",
+ request_id,
+ resp.status,
+ time.perf_counter() - start_ts,
+ )
+ except asyncio.TimeoutError as exc:
+ raise RuntimeError(f"Prefill service timeout at {url}") from exc
+ except aiohttp.ClientError as exc:
+ raise RuntimeError(f"Prefill service unavailable at {url}") from exc
+
+ async def _stream_decode(
+ request_path: str,
+ payload: dict,
+ headers: dict[str, str],
+ request_id: str,
+ ):
+ url = f"{DECODE_BASE}{request_path}"
+ # Stream tokens from the decode service once the prefill stage has
+ # materialized KV caches on the target workers.
+ logger.info("[decode] start request_id=%s url=%s", request_id, url)
+ try:
+ async with (
+ aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
+ session.post(url=url, json=payload, headers=headers) as resp,
+ ):
+ if resp.status != 200:
+ error_text = await resp.text()
+ logger.error(
+ "Decode backend error %s - %s", resp.status, error_text
+ )
+ err_msg = (
+ '{"error": "Decode backend error ' + str(resp.status) + '"}'
+ )
+ yield err_msg.encode()
+ return
+ logger.info(
+ "[decode] streaming response request_id=%s status=%s",
+ request_id,
+ resp.status,
+ )
+ async for chunk_bytes in resp.content.iter_chunked(1024):
+ yield chunk_bytes
+ logger.info("[decode] finished streaming request_id=%s", request_id)
+ except asyncio.TimeoutError:
+ logger.error("Decode service timeout at %s", url)
+ yield b'{"error": "Decode service timeout"}'
+ except aiohttp.ClientError as exc:
+ logger.error("Decode service error at %s: %s", url, exc)
+ yield b'{"error": "Decode service unavailable"}'
async def process_request():
"""Process a single request through prefill and decode stages"""
@@ -146,13 +206,27 @@ async def process_request():
# Create prefill request (max_tokens=1)
prefill_request = original_request_data.copy()
prefill_request["max_tokens"] = 1
+ if "max_completion_tokens" in prefill_request:
+ prefill_request["max_completion_tokens"] = 1
# Execute prefill stage
- async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
- continue
+ # The request id encodes both KV socket addresses so the backend can
+ # shuttle tensors directly via NCCL once the prefill response
+ # completes.
+ request_id = (
+ f"___prefill_addr_{PREFILL_KV_ADDR}___decode_addr_"
+ f"{DECODE_KV_ADDR}_{uuid.uuid4().hex}"
+ )
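+            # e.g. with the default KV settings this looks like
+            # "___prefill_addr_localhost:14579___decode_addr_localhost:14580_<uuid4-hex>"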
+
+ headers = _build_headers(request_id)
+ await _run_prefill(request.path, prefill_request, headers, request_id)
# Execute decode stage and stream response
- generator = forward_request(DECODE_SERVICE_URL, original_request_data)
+ # Pass the unmodified user request so the decode phase can continue
+ # sampling with the already-populated KV cache.
+ generator = _stream_decode(
+ request.path, original_request_data, headers, request_id
+ )
response = await make_response(generator)
response.timeout = None # Disable timeout for streaming response
return response
@@ -168,23 +242,10 @@ async def process_request():
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
"""Handle incoming API requests with concurrency and rate limiting"""
- # Create task for request processing
- task = asyncio.create_task(process_request())
-
- # Enqueue request or reject if queue is full
- if not await request_queue.enqueue(task):
- return Response(
- response=b'{"error": "Server busy, try again later"}',
- status=503,
- content_type="application/json",
- )
-
try:
- # Return the response from the processing task
- return await task
+ return await process_request()
except asyncio.CancelledError:
- # Handle task cancellation (timeout or queue full)
- logger.warning("Request cancelled due to timeout or queue full")
+ logger.warning("Request cancelled")
return Response(
response=b'{"error": "Request cancelled"}',
status=503,
diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
index 027f67ad4db6..e07d6c776bc0 100644
--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -255,8 +255,8 @@ def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
torch.cuda.synchronize()
# Timing
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = torch.Event(enable_timing=True)
+ end_event = torch.Event(enable_timing=True)
latencies = []
for _ in range(num_iters):
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index 8787724d77cf..ac78c019a59e 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -237,6 +237,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
b_q_weight=w_q,
b_bias=None,
b_scales=w_s,
+ a_scales=None,
global_scale=None,
b_zeros=w_zp,
g_idx=g_idx,
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 12ca9214b1f9..48d790aec9e0 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -263,7 +263,7 @@ def gen_allspark_params():
results.append(
benchmark.Timer(
- stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
+ stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
@@ -273,7 +273,7 @@ def gen_allspark_params():
results.append(
benchmark.Timer(
- stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
+ stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, None, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c99951aa2782..a1af0b8aec3d 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -185,8 +185,8 @@ def run():
graph.replay()
torch.cuda.synchronize()
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = torch.Event(enable_timing=True)
+ end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
index efa5a7386027..b8913a217c60 100644
--- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py
+++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py
@@ -105,8 +105,8 @@ def run():
graph.replay()
torch.cuda.synchronize()
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = torch.Event(enable_timing=True)
+ end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
@@ -241,8 +241,8 @@ def run(input: tuple):
graph.replay()
torch.cuda.synchronize()
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = torch.Event(enable_timing=True)
+ end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py
index cb848d2bf579..83bd91917508 100644
--- a/benchmarks/kernels/benchmark_mrope.py
+++ b/benchmarks/kernels/benchmark_mrope.py
@@ -6,7 +6,7 @@
#
# The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
-# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
+# is_neox_style, rope_parameters, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
#
@@ -86,9 +86,8 @@ def benchmark_mrope(
num_heads: int,
num_kv_heads: int,
max_position: int = 8192,
- rope_theta: float = 10000,
is_neox_style: bool = True,
- rope_scaling: dict[str, Any] = None,
+ rope_parameters: dict[str, Any] | None = None,
dtype: torch.dtype = torch.bfloat16,
seed: int = 0,
warmup_iter: int = 10,
@@ -102,9 +101,8 @@ def benchmark_mrope(
head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position,
- base=rope_theta,
is_neox_style=is_neox_style,
- rope_scaling=rope_scaling,
+ rope_parameters=rope_parameters,
dtype=dtype,
).to(device=device)
@@ -203,9 +201,8 @@ def benchmark_mrope(
num_kv_heads,
head_dim,
max_position,
- rope_theta,
is_neox_style,
- str(rope_scaling),
+ str(rope_parameters),
str(dtype).split(".")[-1],
torch_stats["mean"],
torch_stats["median"],
@@ -255,9 +252,8 @@ def benchmark_mrope(
"num_kv_heads",
"head_dim",
"max_position",
- "rope_theta",
"is_neox_style",
- "rope_scaling",
+ "rope_parameters",
"dtype",
"torch_mean",
"torch_median",
@@ -303,7 +299,7 @@ def benchmark_mrope(
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
is_neox_style = True
- rope_theta = config.rope_theta
+ rope_parameters = config.rope_parameters
max_position = config.max_position_embeddings
for num_tokens in num_tokens_list:
@@ -315,9 +311,8 @@ def benchmark_mrope(
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
- rope_theta=rope_theta,
is_neox_style=is_neox_style,
- rope_scaling=config.rope_scaling,
+ rope_parameters=rope_parameters,
dtype=getattr(torch, args.dtype),
seed=args.seed,
warmup_iter=args.warmup_iter,
diff --git a/benchmarks/kernels/benchmark_per_token_group_quant.py b/benchmarks/kernels/benchmark_per_token_group_quant.py
index bdc1eb733084..eba4d510258b 100644
--- a/benchmarks/kernels/benchmark_per_token_group_quant.py
+++ b/benchmarks/kernels/benchmark_per_token_group_quant.py
@@ -30,8 +30,8 @@ def _time_cuda(
fn()
torch.cuda.synchronize()
- start = torch.cuda.Event(enable_timing=True)
- end = torch.cuda.Event(enable_timing=True)
+ start = torch.Event(enable_timing=True)
+ end = torch.Event(enable_timing=True)
start.record()
for _ in range(bench_iters):
diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
index a5887aafd30d..de01ff197eab 100644
--- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
+++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py
@@ -253,8 +253,8 @@ def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
)
torch.cuda.synchronize()
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = torch.Event(enable_timing=True)
+ end_event = torch.Event(enable_timing=True)
# Benchmark
latencies: list[float] = []
diff --git a/benchmarks/kernels/benchmark_trtllm_decode_attention.py b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
index 29ce18234dfa..1d0d6fbb9a47 100644
--- a/benchmarks/kernels/benchmark_trtllm_decode_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_decode_attention.py
@@ -127,8 +127,8 @@ def benchmark_decode(
def time_fn(fn, warmup=10, trials=20):
torch.cuda.synchronize()
- start = torch.cuda.Event(enable_timing=True)
- end = torch.cuda.Event(enable_timing=True)
+ start = torch.Event(enable_timing=True)
+ end = torch.Event(enable_timing=True)
times = []
for i in range(warmup):
fn()
diff --git a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
index 2a25d0374811..84bde723abf7 100644
--- a/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
+++ b/benchmarks/kernels/benchmark_trtllm_prefill_attention.py
@@ -139,8 +139,8 @@ def benchmark_prefill(
def time_fn(fn, warmup=10, trials=20):
torch.cuda.synchronize()
- start = torch.cuda.Event(enable_timing=True)
- end = torch.cuda.Event(enable_timing=True)
+ start = torch.Event(enable_timing=True)
+ end = torch.Event(enable_timing=True)
times = []
for i in range(warmup):
fn()
diff --git a/benchmarks/kernels/benchmark_w8a8_block_fp8.py b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
index ab54f81985bc..b52500c8c521 100644
--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -183,8 +183,8 @@ def run():
run()
torch.cuda.synchronize()
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
+ start_event = torch.Event(enable_timing=True)
+ end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
diff --git a/benchmarks/kernels/deepgemm/README.md b/benchmarks/kernels/deepgemm/README.md
index 41e68e047be8..a28c6956be0e 100644
--- a/benchmarks/kernels/deepgemm/README.md
+++ b/benchmarks/kernels/deepgemm/README.md
@@ -2,7 +2,7 @@
This directory includes benchmarks between DeepSeek's DeepGEMM block fp8 kernels against vLLM's existing triton and CUTLASS-based kernels.
-Currently this just includes dense GEMMs and only works on Hopper GPUs.
+Currently, this just includes dense GEMMs and only works on Hopper GPUs.
## Setup
diff --git a/benchmarks/multi_turn/README.md b/benchmarks/multi_turn/README.md
index f5b5c6c97d48..b0be1e3a69a6 100644
--- a/benchmarks/multi_turn/README.md
+++ b/benchmarks/multi_turn/README.md
@@ -55,6 +55,10 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```
+If you run with `--warmup-step`, the summary will also include `warmup_runtime_sec`
+and `total_runtime_incl_warmup_sec` (while `runtime_sec` continues to reflect the
+benchmark-only runtime so the reported throughput stays comparable).
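+
+For example, a 12.3 sec warmup followed by a 120.4 sec benchmark would report
+`warmup_runtime_sec` = 12.3, `runtime_sec` = 120.4, and
+`total_runtime_incl_warmup_sec` = 132.7 (illustrative numbers).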
+
### JSON configuration file for synthetic conversations generation
The input flag `--input-file` is used to determine the input conversations for the benchmark.
diff --git a/benchmarks/multi_turn/benchmark_serving_multi_turn.py b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
index ae9e9753441a..e23f6b923f1b 100644
--- a/benchmarks/multi_turn/benchmark_serving_multi_turn.py
+++ b/benchmarks/multi_turn/benchmark_serving_multi_turn.py
@@ -561,8 +561,11 @@ async def client_main(
f"{Color.CYAN}Started client {client_id}: max_num_requests={args.max_num_requests}, max_active_conversations={args.max_active_conversations}{Color.RESET}" # noqa: E501
)
- random.seed(args.seed)
- np.random.seed(args.seed)
+ # Set unique seed per client (each client runs in its own process)
+ # Add 1 to ensure no client uses the same seed as the main process
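+    # (e.g. with --seed 0: the main process seeds with 0, client 0 with 1,
+    #  client 1 with 2, and so on)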
+ client_seed = args.seed + client_id + 1
+ random.seed(client_seed)
+ np.random.seed(client_seed)
# Active conversations
active_convs: ConversationsMap = {}
@@ -1073,6 +1076,7 @@ def process_statistics(
verbose: bool,
gen_conv_args: GenConvArgs | None = None,
excel_output: bool = False,
+ warmup_runtime_sec: float | None = None,
) -> None:
if len(client_metrics) == 0:
logger.info("No samples to process")
@@ -1166,8 +1170,13 @@ def process_statistics(
# Convert milliseconds to seconds
runtime_sec = runtime_sec / 1000.0
requests_per_sec = float(len(df)) / runtime_sec
-
- params = {"runtime_sec": runtime_sec, "requests_per_sec": requests_per_sec}
+ params = {
+ "runtime_sec": runtime_sec,
+ "requests_per_sec": requests_per_sec,
+ }
+ if warmup_runtime_sec is not None:
+ params["warmup_runtime_sec"] = warmup_runtime_sec
+ params["total_runtime_incl_warmup_sec"] = runtime_sec + warmup_runtime_sec
# Generate a summary of relevant metrics (and drop irrelevant data)
df = df.drop(columns=exclude).describe(percentiles=percentiles).transpose()
@@ -1490,6 +1499,7 @@ async def main() -> None:
f"Invalid --warmup-percentage={args.warmup_percentage}"
) from None
+ # Set global seeds for main process
random.seed(args.seed)
np.random.seed(args.seed)
@@ -1548,6 +1558,8 @@ async def main() -> None:
url=args.url, num_clients=args.num_clients, early_stop=not args.no_early_stop
)
+ warmup_runtime_sec: float | None = None
+
# Warm-up step
if args.warmup_step:
# Only send a single user prompt from every conversation.
@@ -1562,26 +1574,56 @@ async def main() -> None:
# all clients should finish their work before exiting
warmup_bench_args = bench_args._replace(early_stop=False)
- logger.info(f"{Color.PURPLE}Warmup start{Color.RESET}")
+ logger.info("%sWarmup start%s", Color.PURPLE, Color.RESET)
+ warmup_start_ns = time.perf_counter_ns()
conversations, _ = await main_mp(
warmup_client_args, req_args, warmup_bench_args, tokenizer, conversations
)
- logger.info(f"{Color.PURPLE}Warmup done{Color.RESET}")
+ warmup_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - warmup_start_ns)
+ logger.info(
+ "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+ Color.PURPLE,
+ warmup_runtime_sec,
+ warmup_runtime_sec * 1000,
+ Color.RESET,
+ )
+ logger.info("%sWarmup done%s", Color.PURPLE, Color.RESET)
# Run the benchmark
- start_time = time.perf_counter_ns()
+ benchmark_start_ns = time.perf_counter_ns()
client_convs, client_metrics = await main_mp(
client_args, req_args, bench_args, tokenizer, conversations
)
- total_runtime_ms = nanosec_to_millisec(time.perf_counter_ns() - start_time)
+ benchmark_runtime_sec = nanosec_to_sec(time.perf_counter_ns() - benchmark_start_ns)
# Calculate requests per second
- total_runtime_sec = total_runtime_ms / 1000.0
- rps = len(client_metrics) / total_runtime_sec
+ requests_per_sec = len(client_metrics) / benchmark_runtime_sec
+ benchmark_runtime_ms = benchmark_runtime_sec * 1000.0
logger.info(
- f"{Color.GREEN}All clients finished, total runtime: {total_runtime_sec:.3f} sec"
- f" ({total_runtime_ms:.3f} ms), requests per second: {rps:.3f}{Color.RESET}"
+ "%sAll clients finished, benchmark runtime: %.3f sec (%.3f ms), "
+ "requests per second: %.3f%s",
+ Color.GREEN,
+ benchmark_runtime_sec,
+ benchmark_runtime_ms,
+ requests_per_sec,
+ Color.RESET,
)
+ if warmup_runtime_sec is not None:
+ total_runtime_sec = benchmark_runtime_sec + warmup_runtime_sec
+ logger.info(
+ "%sWarmup runtime: %.3f sec (%.3f ms)%s",
+ Color.GREEN,
+ warmup_runtime_sec,
+ warmup_runtime_sec * 1000,
+ Color.RESET,
+ )
+ logger.info(
+ "%sTotal runtime (including warmup): %.3f sec (%.3f ms)%s",
+ Color.GREEN,
+ total_runtime_sec,
+ total_runtime_sec * 1000,
+ Color.RESET,
+ )
# Benchmark parameters
params = {
@@ -1606,6 +1648,7 @@ async def main() -> None:
verbose=args.verbose,
gen_conv_args=gen_conv_args,
excel_output=args.excel_output,
+ warmup_runtime_sec=warmup_runtime_sec,
)
if args.output_file is not None:
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index bb0179c79c10..fbbb03c5ed46 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -242,7 +242,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
SUBBUILD_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-subbuild"
SOURCE_DIR "${FETCHCONTENT_BASE_DIR}/arm_compute-src"
GIT_REPOSITORY https://github.com/ARM-software/ComputeLibrary.git
- GIT_TAG v52.2.0
+ GIT_TAG v52.6.0
GIT_SHALLOW TRUE
GIT_PROGRESS TRUE
)
@@ -310,7 +310,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
- GIT_TAG v3.9
+ GIT_TAG v3.10
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
@@ -375,6 +375,7 @@ set(VLLM_EXT_SRC
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
"csrc/cpu/shm.cpp"
+ "csrc/cpu/cpu_wna16.cpp"
${VLLM_EXT_SRC})
if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
set(VLLM_EXT_SRC
diff --git a/cmake/external_projects/triton_kernels.cmake b/cmake/external_projects/triton_kernels.cmake
new file mode 100644
index 000000000000..d35ad123dd9d
--- /dev/null
+++ b/cmake/external_projects/triton_kernels.cmake
@@ -0,0 +1,53 @@
+# Install OpenAI triton_kernels from https://github.com/triton-lang/triton/tree/main/python/triton_kernels
+
+set(DEFAULT_TRITON_KERNELS_TAG "v3.5.0")
+
+# Set TRITON_KERNELS_SRC_DIR for local development with vLLM. We expect
+# TRITON_KERNELS_SRC_DIR to point directly at the triton_kernels python directory.
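+# Example for local development (path is illustrative):
+#   export TRITON_KERNELS_SRC_DIR=/path/to/triton/python/triton_kernels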
+if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
+ message(STATUS "[triton_kernels] Fetch from $ENV{TRITON_KERNELS_SRC_DIR}")
+ FetchContent_Declare(
+ triton_kernels
+ SOURCE_DIR $ENV{TRITON_KERNELS_SRC_DIR}
+ )
+
+else()
+ set(TRITON_GIT "https://github.com/triton-lang/triton.git")
+ message (STATUS "[triton_kernels] Fetch from ${TRITON_GIT}:${DEFAULT_TRITON_KERNELS_TAG}")
+ FetchContent_Declare(
+ triton_kernels
+ # TODO (varun) : Fetch just the triton_kernels directory from Triton
+ GIT_REPOSITORY https://github.com/triton-lang/triton.git
+ GIT_TAG ${DEFAULT_TRITON_KERNELS_TAG}
+ GIT_PROGRESS TRUE
+ SOURCE_SUBDIR python/triton_kernels/triton_kernels
+ )
+endif()
+
+# Fetch content
+FetchContent_MakeAvailable(triton_kernels)
+
+if (NOT triton_kernels_SOURCE_DIR)
+ message (FATAL_ERROR "[triton_kernels] Cannot resolve triton_kernels_SOURCE_DIR")
+endif()
+
+if (DEFINED ENV{TRITON_KERNELS_SRC_DIR})
+ set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/")
+else()
+ set(TRITON_KERNELS_PYTHON_DIR "${triton_kernels_SOURCE_DIR}/python/triton_kernels/triton_kernels/")
+endif()
+
+message (STATUS "[triton_kernels] triton_kernels is available at ${TRITON_KERNELS_PYTHON_DIR}")
+
+add_custom_target(triton_kernels)
+
+# Ensure the vllm/third_party directory exists before installation
+install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/third_party/triton_kernels\")")
+
+## Copy .py files to install directory.
+install(DIRECTORY
+ ${TRITON_KERNELS_PYTHON_DIR}
+ DESTINATION
+ vllm/third_party/triton_kernels/
+ COMPONENT triton_kernels
+ FILES_MATCHING PATTERN "*.py")
diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake
index 29db9fa273a4..ff687e0af7b4 100644
--- a/cmake/external_projects/vllm_flash_attn.cmake
+++ b/cmake/external_projects/vllm_flash_attn.cmake
@@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 8e1b01d56210dc72030a2d0d41c2d8d266ba6309
+ GIT_TAG 86f8f157cf82aa2342743752b97788922dd7de43
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index ca0062ba4fab..5047c354ff7d 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -495,7 +495,13 @@ function (define_extension_target MOD_NAME)
set(SOABI_KEYWORD "")
endif()
- if (ARG_USE_SABI)
+ run_python(IS_FREETHREADED_PYTHON
+ "import sysconfig; print(1 if sysconfig.get_config_var(\"Py_GIL_DISABLED\") else 0)"
+ "Failed to determine whether interpreter is free-threaded")
+
+ # Free-threaded Python doesn't yet support the stable ABI (see PEP 803/809),
+ # so avoid using the stable ABI under free-threading only.
+ if (ARG_USE_SABI AND NOT IS_FREETHREADED_PYTHON)
Python_add_library(${MOD_NAME} MODULE USE_SABI ${ARG_USE_SABI} ${SOABI_KEYWORD} "${ARG_SOURCES}")
else()
Python_add_library(${MOD_NAME} MODULE ${SOABI_KEYWORD} "${ARG_SOURCES}")
diff --git a/csrc/attention/merge_attn_states.cu b/csrc/attention/merge_attn_states.cu
index 229d9862fb67..27d1e990c611 100644
--- a/csrc/attention/merge_attn_states.cu
+++ b/csrc/attention/merge_attn_states.cu
@@ -16,7 +16,8 @@ __global__ void merge_attn_states_kernel(
scalar_t* output, float* output_lse, const scalar_t* prefix_output,
const float* prefix_lse, const scalar_t* suffix_output,
const float* suffix_lse, const uint num_tokens, const uint num_heads,
- const uint head_size) {
+ const uint head_size, const uint prefix_head_stride,
+ const uint output_head_stride) {
using pack_128b_t = uint4;
const uint pack_size = 16 / sizeof(scalar_t);
const uint threads_per_head = head_size / pack_size;
@@ -34,11 +35,13 @@ __global__ void merge_attn_states_kernel(
const uint head_idx = token_head_idx % num_heads;
const uint pack_offset = pack_idx * pack_size; // (0~15)*8, etc.
- const uint head_offset =
- token_idx * num_heads * head_size + head_idx * head_size;
- const scalar_t* prefix_head_ptr = prefix_output + head_offset;
- const scalar_t* suffix_head_ptr = suffix_output + head_offset;
- scalar_t* output_head_ptr = output + head_offset;
+ const uint src_head_offset = token_idx * num_heads * prefix_head_stride +
+ head_idx * prefix_head_stride;
+ const uint dst_head_offset = token_idx * num_heads * output_head_stride +
+ head_idx * output_head_stride;
+ const scalar_t* prefix_head_ptr = prefix_output + src_head_offset;
+ const scalar_t* suffix_head_ptr = suffix_output + src_head_offset;
+ scalar_t* output_head_ptr = output + dst_head_offset;
float p_lse = prefix_lse[head_idx * num_tokens + token_idx];
float s_lse = suffix_lse[head_idx * num_tokens + token_idx];
@@ -140,7 +143,7 @@ __global__ void merge_attn_states_kernel(
reinterpret_cast(prefix_lse.data_ptr()), \
reinterpret_cast(suffix_output.data_ptr()), \
reinterpret_cast(suffix_lse.data_ptr()), num_tokens, \
- num_heads, head_size); \
+ num_heads, head_size, prefix_head_stride, output_head_stride); \
}
/*@brief Merges the attention states from prefix and suffix
@@ -166,17 +169,11 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint num_tokens = output.size(0);
const uint num_heads = output.size(1);
const uint head_size = output.size(2);
+ const uint prefix_head_stride = prefix_output.stride(1);
+ const uint output_head_stride = output.stride(1);
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
- TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
- "output heads must be contiguous in memory");
- TORCH_CHECK(
- prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
- "prefix_output heads must be contiguous in memory");
- TORCH_CHECK(
- suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
- "suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr();
diff --git a/csrc/cache.h b/csrc/cache.h
index b162a4a2bc31..f2a5ec0acf5c 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -41,11 +41,12 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
void gather_and_maybe_dequant_cache(
- torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
- torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
- torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
- torch::Tensor const& cu_seq_lens, // [BATCH+1]
- int64_t batch_size, const std::string& kv_cache_dtype,
+ torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+ torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
+ torch::Tensor const& cu_seq_lens, // [BATCH+1]
+ torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
+ int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional seq_starts = std::nullopt);
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index 0aa0dc14c748..8a5457206c70 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -552,7 +552,11 @@ __global__ void indexer_k_quant_and_cache_kernel(
#ifndef USE_ROCM
__syncwarp();
#endif
+#if defined(__gfx942__)
+ float scale = fmaxf(amax, 1e-4) / 224.0f;
+#else
float scale = fmaxf(amax, 1e-4) / 448.0f;
+#endif
if (use_ue8m0) {
scale = exp2f(ceilf(log2f(scale)));
}
@@ -901,87 +905,80 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
namespace vllm {
// grid is launched with dimensions (batch, num_splits)
-template
+template
__global__ void gather_and_maybe_dequant_cache(
- const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
- // ENTRIES...]
- scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
- const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
- const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
- const int32_t block_size, const int32_t entry_size,
+ const cache_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
+ // ENTRIES...]
+ scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
+ const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
+ const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
+ const int32_t* __restrict__ token_to_seq, // [MAX_TOKEN_ACROSS_CHUNK]
+ const int32_t num_tokens, const int32_t block_size,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
const float* __restrict__ scale,
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
// batch
+ constexpr int vec_size = sizeof(float4) / sizeof(scalar_t);
+  using ltype = vllm::vec_n_t<cache_t, vec_size>;
+  using stype = vllm::vec_n_t<scalar_t, vec_size>;
+  // We are adding this for code readability; it is compiled out in release
+  // builds.
+ assert(CTA_SIZE == blockDim.x);
- const int64_t bid = blockIdx.x; // Batch ID
- const int32_t num_splits = gridDim.y;
- const int32_t split = blockIdx.y;
- const int32_t seq_start = cu_seq_lens[bid];
- const int32_t seq_end = cu_seq_lens[bid + 1];
- const int32_t seq_len = seq_end - seq_start;
- const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
- const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
-
- const int32_t split_start = split * split_blocks;
- const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
-
- const bool is_active_split = (split_start < tot_blocks);
- const bool is_last_split = (split_end == tot_blocks);
-
- if (!is_active_split) return;
-
- int32_t full_blocks_end = split_end;
- int32_t partial_block_size = 0;
-
- // Adjust the pointer for the block_table for this batch.
- // If seq_starts is provided, compute an offset based on (seq_starts[bid] /
- // page_size)
- const int32_t batch_offset = bid * block_table_stride;
- int32_t offset = 0;
- if (seq_starts != nullptr) {
- offset = seq_starts[bid] / block_size;
- }
- const int32_t* batch_block_table = block_table + batch_offset + offset;
-
- // Adjust dst pointer based on the cumulative sequence lengths.
- dst += seq_start * dst_entry_stride;
-
- if (is_last_split) {
- partial_block_size = seq_len % block_size;
- if (partial_block_size) full_blocks_end -= 1;
- }
+#pragma unroll
+ for (int token_id = blockIdx.x; token_id < num_tokens;
+ token_id += gridDim.x) {
+ int64_t batch_id = token_to_seq[token_id];
+ int64_t batch_start = cu_seq_lens[batch_id];
+ int64_t batch_end = cu_seq_lens[batch_id + 1];
+ int32_t batch_offset = token_id - batch_start;
+
+ if (token_id >= batch_end) return;
+ int32_t offset = 0;
+ if (seq_starts != nullptr) {
+ offset = seq_starts[batch_id];
+ }
+ batch_offset += offset;
+ int32_t block_table_id = batch_offset / block_size;
+ int32_t slot_id = batch_offset % block_size;
+ int32_t block_table_offset = batch_id * block_table_stride + block_table_id;
+ int32_t block_id = block_table[block_table_offset];
+ int64_t cache_offset =
+ block_id * cache_block_stride + slot_id * cache_entry_stride;
+ constexpr int32_t vec_iter_cnt = ENTRY_SIZE / vec_size;
+ scalar_t* dst_ = dst + token_id * dst_entry_stride;
+    cache_t* src_ = const_cast<cache_t*>(src_cache) + cache_offset;
- auto copy_entry = [&](const cache_t* __restrict__ _src,
- scalar_t* __restrict__ _dst) {
- for (int i = threadIdx.x; i < entry_size; i += blockDim.x) {
+#pragma unroll
+ for (int idx = threadIdx.x; idx < vec_iter_cnt; idx += CTA_SIZE) {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
-      _dst[i] = static_cast<scalar_t>(_src[i]);
+        reinterpret_cast<stype*>(dst_)[idx] =
+            static_cast<stype>(reinterpret_cast<ltype*>(src_)[idx]);
} else {
-      _dst[i] =
-          fp8::scaled_convert<scalar_t, cache_t, kv_dt>(_src[i], *scale);
+        ltype loaded_val = reinterpret_cast<ltype*>(src_)[idx];
+        stype store_val;
+#pragma unroll
+        for (int j = 0; j < vec_size; ++j) {
+          store_val.val[j] = fp8::scaled_convert<scalar_t, cache_t, kv_dt>(
+              loaded_val.val[j], *scale);
+        }
+        reinterpret_cast<stype*>(dst_)[idx] = store_val;
}
}
- };
-
- for (int pid = split_start; pid < full_blocks_end; ++pid) {
- auto block_id = batch_block_table[pid];
- auto block_start_ptr = src_cache + block_id * cache_block_stride;
- auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
- for (int eid = 0; eid < block_size; ++eid) {
- copy_entry(block_start_ptr + eid * cache_entry_stride,
- block_dst_ptr + eid * dst_entry_stride);
- }
- }
-
- if (partial_block_size) {
- auto block_id = batch_block_table[full_blocks_end];
- auto block_start_ptr = src_cache + block_id * cache_block_stride;
- auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
- for (int eid = 0; eid < partial_block_size; ++eid) {
- copy_entry(block_start_ptr + eid * cache_entry_stride,
- block_dst_ptr + eid * dst_entry_stride);
+ // process tail
+ constexpr int32_t tail_cnt = ENTRY_SIZE % vec_size;
+ dst_ = dst_ + ENTRY_SIZE - tail_cnt;
+ src_ = src_ + ENTRY_SIZE - tail_cnt;
+#pragma unroll
+ for (int idx = threadIdx.x; idx < tail_cnt; idx += CTA_SIZE) {
+ if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      dst_[idx] = static_cast<scalar_t>(src_[idx]);
+ } else {
+      dst_[idx] =
+          fp8::scaled_convert<scalar_t, cache_t, kv_dt>(src_[idx], *scale);
+ }
}
}
}
@@ -992,34 +989,38 @@ __global__ void gather_and_maybe_dequant_cache(
// SCALAR_T is the data type of the destination tensor.
// CACHE_T is the stored data type of kv-cache.
// KV_DTYPE is the real data type of kv-cache.
-#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
- vllm::gather_and_maybe_dequant_cache \
- <<>>( \
- reinterpret_cast(src_cache.data_ptr()), \
- reinterpret_cast(dst.data_ptr()), \
- block_table.data_ptr(), cu_seq_lens.data_ptr(), \
- block_size, entry_size, block_table_stride, cache_block_stride, \
- cache_entry_stride, dst_entry_stride, \
- reinterpret_cast(scale.data_ptr()), seq_starts_ptr);
+#define CALL_GATHER_CACHE(SCALAR_T, CACHE_T, KV_DTYPE) \
+ vllm::gather_and_maybe_dequant_cache \
+ <<>>( \
+ reinterpret_cast(src_cache.data_ptr()), \
+ reinterpret_cast(dst.data_ptr()), \
+ block_table.data_ptr(), cu_seq_lens.data_ptr(), \
+ token_to_seq.data_ptr(), num_tokens, block_size, \
+ block_table_stride, cache_block_stride, cache_entry_stride, \
+ dst_entry_stride, reinterpret_cast(scale.data_ptr()), \
+ seq_starts_ptr);
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
+// - token_to_seq contains the back mapping from token_id to batch_id
// - Optionally, seq_starts (if provided) offsets the starting block index by
// (seq_starts[bid] / page_size)
void gather_and_maybe_dequant_cache(
- torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
- torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
- torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
- torch::Tensor const& cu_seq_lens, // [BATCH+1]
- int64_t batch_size, const std::string& kv_cache_dtype,
+ torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
+ torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
+ torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
+ torch::Tensor const& cu_seq_lens, // [BATCH+1]
+ torch::Tensor const& token_to_seq, // [MAX_TOKEN_ACROSS_CHUNKS]
+ int64_t num_tokens, const std::string& kv_cache_dtype,
torch::Tensor const& scale,
std::optional seq_starts = std::nullopt) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1);
- int32_t entry_size = src_cache.flatten(2, -1).size(2);
+ int32_t head_dim = dst.size(-1);
TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32");
@@ -1029,6 +1030,9 @@ void gather_and_maybe_dequant_cache(
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
+  TORCH_CHECK(head_dim == 576,
+              "gather_and_maybe_dequant_cache only supports head_dim == 576 "
+              "for better performance")
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
@@ -1046,10 +1050,9 @@ void gather_and_maybe_dequant_cache(
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
- // Decide on the number of splits based on the batch size.
- int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
- dim3 grid(batch_size, num_splits);
- dim3 block(1024);
+ constexpr int32_t thread_block_size = 64;
+ dim3 grid(num_tokens);
+ dim3 block(thread_block_size);
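+  // i.e. launch one 64-thread block per gathered token; threads in the block
+  // cooperate on the vectorized copy of that token's cache entry.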
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr() : nullptr;
diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp
index 50f17c758c14..92f8bee5a47a 100644
--- a/csrc/cpu/cpu_attn.cpp
+++ b/csrc/cpu/cpu_attn.cpp
@@ -13,6 +13,18 @@
#define AMX_DISPATCH(...) case cpu_attention::ISA::AMX:
#endif
+#ifdef __aarch64__
+ #include "cpu_attn_neon.hpp"
+ #define NEON_DISPATCH(...) \
+ case cpu_attention::ISA::NEON: { \
+ using attn_impl = cpu_attention::AttentionImpl; \
+ return __VA_ARGS__(); \
+ }
+#else
+ #define NEON_DISPATCH(...) case cpu_attention::ISA::NEON:
+#endif // #ifdef __aarch64__
+
#define CPU_ATTN_DISPATCH_CASE(HEAD_DIM, ...) \
case HEAD_DIM: { \
constexpr size_t head_dim = HEAD_DIM; \
@@ -41,6 +53,7 @@
[&] { \
switch (ISA_TYPE) { \
AMX_DISPATCH(__VA_ARGS__) \
+ NEON_DISPATCH(__VA_ARGS__) \
case cpu_attention::ISA::VEC: { \
using attn_impl = \
cpu_attention::AttentionImpl
#include
#include
+#if defined(__APPLE__)
+  #include <sys/sysctl.h>
+#endif
+
#include "cpu_types.hpp"
#include "scratchpad_manager.h"
#include "cpu_attn_macros.h"
+#include "utils.hpp"
namespace cpu_attention {
-enum class ISA { AMX, VEC, VEC16 };
+enum class ISA { AMX, VEC, VEC16, NEON };
template
class AttentionImpl {};
@@ -139,6 +143,12 @@ struct AttentionMetadata {
case ISA::VEC:
ss << "VEC, ";
break;
+ case ISA::VEC16:
+ ss << "VEC16, ";
+ break;
+ case ISA::NEON:
+ ss << "NEON, ";
+ break;
}
ss << "workitem_group_num: " << workitem_group_num
<< ", reduction_item_num: " << reduction_item_num
@@ -741,9 +751,21 @@ class AttentionScheduler {
static int64_t get_available_l2_size() {
static int64_t size = []() {
+#if defined(__APPLE__)
+ // macOS doesn't have _SC_LEVEL2_CACHE_SIZE. Use sysctlbyname.
+ int64_t l2_cache_size = 0;
+ size_t len = sizeof(l2_cache_size);
+ if (sysctlbyname("hw.l2cachesize", &l2_cache_size, &len, NULL, 0) == 0 &&
+ l2_cache_size > 0) {
+ return l2_cache_size >> 1; // use 50% of L2 cache
+ }
+ // Fallback if sysctlbyname fails
+ return 128LL * 1024 >> 1; // use 50% of 128KB
+#else
long l2_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE);
TORCH_CHECK_NE(l2_cache_size, -1);
return l2_cache_size >> 1; // use 50% of L2 cache
+#endif
}();
return size;
}
@@ -816,15 +838,21 @@ struct VecTypeTrait {
using vec_t = vec_op::FP32Vec16;
};
+// ARM only supports BF16 with ARMv8.6-A extension
+#if (defined(__aarch64__) && !defined(ARM_BF16_SUPPORT))
+#else
template <>
struct VecTypeTrait {
using vec_t = vec_op::BF16Vec16;
};
+#endif
+#if !defined(__powerpc__) && !defined(__s390x__)
template <>
struct VecTypeTrait {
using vec_t = vec_op::FP16Vec16;
};
+#endif
template
void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
@@ -1586,9 +1614,17 @@ class AttentionMainLoop {
if (use_sink) {
alignas(64) float s_aux_fp32[16];
+#if defined(__aarch64__) && !defined(ARM_BF16_SUPPORT)
+ // ARM without native BF16 support: manual conversion
+ for (int i = 0; i < 16; ++i) {
+ s_aux_fp32[i] = static_cast(curr_s_aux[i]);
+ }
+#else
+ // All other platforms have BF16Vec16 available
vec_op::BF16Vec16 vec_bf16(curr_s_aux);
vec_op::FP32Vec16 vec_fp32(vec_bf16);
vec_fp32.save(s_aux_fp32);
+#endif
float* __restrict__ curr_sum_buffer = sum_buffer;
float* __restrict__ curr_max_buffer = max_buffer;
diff --git a/csrc/cpu/cpu_attn_neon.hpp b/csrc/cpu/cpu_attn_neon.hpp
new file mode 100644
index 000000000000..827f0cfbc718
--- /dev/null
+++ b/csrc/cpu/cpu_attn_neon.hpp
@@ -0,0 +1,386 @@
+#ifndef CPU_ATTN_NEON_HPP
+#define CPU_ATTN_NEON_HPP
+
+#include "cpu_attn_impl.hpp"
+#include <arm_neon.h>
+#include <cstring>
+namespace cpu_attention {
+
+namespace {
+
+#define BLOCK_SIZE_ALIGNMENT 32
+#define HEAD_SIZE_ALIGNMENT 32
+#define MAX_Q_HEAD_NUM_PER_ITER 16
+
+// These do not use vectorized class for loading / converting
+// because csrc/cpu/cpu_types_arm.hpp does not have fallback options
+// for vec_op::BF16Vec* / vec_op::BF16Vec* on Arm HW that
+// doesn't support BF16.
+// We don't use vec_op::FP32Vec* or vec_op::FP16Vec* for consistency.
+template <typename kv_cache_t>
+FORCE_INLINE void load_row8_B_as_f32(const kv_cache_t* p, float32x4_t& b0,
+ float32x4_t& b1);
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32(const float* p, float32x4_t& b0,
+ float32x4_t& b1) {
+ b0 = vld1q_f32(p + 0);
+ b1 = vld1q_f32(p + 4);
+}
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32(const c10::Half* p,
+ float32x4_t& b0,
+ float32x4_t& b1) {
+  const float16_t* h = reinterpret_cast<const float16_t*>(p);
+ float16x8_t v = vld1q_f16(h);
+ b0 = vcvt_f32_f16(vget_low_f16(v));
+ b1 = vcvt_f32_f16(vget_high_f16(v));
+}
+
+template <>
+FORCE_INLINE void load_row8_B_as_f32(const c10::BFloat16* p,
+ float32x4_t& b0,
+ float32x4_t& b1) {
+  const uint16_t* u = reinterpret_cast<const uint16_t*>(p);
+#ifdef ARM_BF16_SUPPORT
+ uint16x8_t u0 = vld1q_u16(u);
+ bfloat16x8_t bf0 = vreinterpretq_bf16_u16(u0);
+ b0 = vcvtq_low_f32_bf16(bf0);
+ b1 = vcvtq_high_f32_bf16(bf0);
+#else
+ uint16x8_t x0 = vld1q_u16(u);
+ uint32x4_t lo = vshlq_n_u32(vmovl_u16(vget_low_u16(x0)), 16);
+ uint32x4_t hi = vshlq_n_u32(vmovl_u16(vget_high_u16(x0)), 16);
+ b0 = vreinterpretq_f32_u32(lo);
+ b1 = vreinterpretq_f32_u32(hi);
+#endif
+}
+
+// Mx8, with 1 <= M <= 8 , K streamed, unroll-by-4 with NEON FMLAs
+// #Loads = (K // 4) * (M + 4 * sizeof(kv_cache_t) / 2)
+// #FMLAs = (K // 4) * (4 * 2 * M)
+// We have (4 * 2 * M) FMLAs for (M + 4 * sizeof(kv_cache_t) / 2) loads
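+// e.g. for M = 8 with a 2-byte (fp16/bf16) KV cache, each unrolled group of
+// 4 k-steps issues 8 + 4 = 12 loads and 4 * 2 * 8 = 64 FMLAs.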
+template <int M, typename kv_cache_t>
+FORCE_INLINE void gemm_micro_neon_fmla_Mx8_Ku4(
+ const float* __restrict A, // [M x K],
+ const kv_cache_t* __restrict B, // [K x 8],
+ float* __restrict C, // [M x 8],
+ int64_t lda, int64_t ldb, int64_t ldc, int32_t K, bool accumulate) {
+ // kernel supports max M of 8, as it'd spill for larger M
+ static_assert(1 <= M && M <= 8, "M must be in [1,8]");
+
+// helpers for per-M codegen
+#define ROWS_APPLY(OP) OP(0) OP(1) OP(2) OP(3) OP(4) OP(5) OP(6) OP(7)
+#define IF_M(i) if constexpr (M > (i))
+
+ // A row base pointers
+#define DECL_A(i) const float* a##i = A + (i) * lda;
+ ROWS_APPLY(DECL_A)
+#undef DECL_A
+
+ // declare 2 accumulators per row of M
+#define DECL_ACC(i) float32x4_t acc##i##_0, acc##i##_1;
+ ROWS_APPLY(DECL_ACC)
+#undef DECL_ACC
+
+ // initialize accumulators
+#define INIT_ACC(i) \
+ IF_M(i) { \
+ if (accumulate) { \
+ acc##i##_0 = vld1q_f32(C + (i) * ldc + 0); \
+ acc##i##_1 = vld1q_f32(C + (i) * ldc + 4); \
+ } else { \
+ acc##i##_0 = vdupq_n_f32(0.f); \
+ acc##i##_1 = vdupq_n_f32(0.f); \
+ } \
+ }
+ ROWS_APPLY(INIT_ACC)
+#undef INIT_ACC
+
+ int32_t k = 0;
+
+ // K unrolled by 4
+ for (; k + 3 < K; k += 4) {
+ // load A[k..k+3] for each active row (M)
+#define LOAD_A4(i) \
+ float32x4_t a##i##v; \
+ IF_M(i) a##i##v = vld1q_f32(a##i + k);
+ ROWS_APPLY(LOAD_A4)
+#undef LOAD_A4
+
+ // helper: FMA lane L from aiv
+#define FMAS_LANE(i, aiv, L) \
+ IF_M(i) { \
+ acc##i##_0 = vfmaq_laneq_f32(acc##i##_0, b0, aiv, L); \
+ acc##i##_1 = vfmaq_laneq_f32(acc##i##_1, b1, aiv, L); \
+ }
+
+ // k + 0
+ {
+ float32x4_t b0, b1;
+ load_row8_B_as_f32(B + (int64_t)(k + 0) * ldb, b0, b1);
+#define STEP_K0(i) FMAS_LANE(i, a##i##v, 0)
+ ROWS_APPLY(STEP_K0)
+#undef STEP_K0
+ }
+ // k + 1
+ {
+ float32x4_t b0, b1;
+ load_row8_B_as_f32(B + (int64_t)(k + 1) * ldb, b0, b1);
+#define STEP_K1(i) FMAS_LANE(i, a##i##v, 1)
+ ROWS_APPLY(STEP_K1)
+#undef STEP_K1
+ }
+ // k + 2
+ {
+ float32x4_t b0, b1;
+ load_row8_B_as_f32(B + (int64_t)(k + 2) * ldb, b0, b1);
+#define STEP_K2(i) FMAS_LANE(i, a##i##v, 2)
+ ROWS_APPLY(STEP_K2)
+#undef STEP_K2
+ }
+ // k + 3
+ {
+ float32x4_t b0, b1;
+ load_row8_B_as_f32(B + (int64_t)(k + 3) * ldb, b0, b1);
+#define STEP_K3(i) FMAS_LANE(i, a##i##v, 3)
+ ROWS_APPLY(STEP_K3)
+#undef STEP_K3
+ }
+#undef FMAS_LANE
+ }
+
+ // K tail
+ for (; k < K; ++k) {
+ float32x4_t b0, b1;
+ load_row8_B_as_f32(B + (int64_t)k * ldb, b0, b1);
+#define TAIL_ROW(i) \
+ IF_M(i) { \
+ float32x4_t ai = vdupq_n_f32(*(a##i + k)); \
+ acc##i##_0 = vfmaq_f32(acc##i##_0, b0, ai); \
+ acc##i##_1 = vfmaq_f32(acc##i##_1, b1, ai); \
+ }
+ ROWS_APPLY(TAIL_ROW)
+#undef TAIL_ROW
+ }
+
+ // store accumulators to C
+#define STORE_ROW(i) \
+ IF_M(i) { \
+ vst1q_f32(C + (i) * ldc + 0, acc##i##_0); \
+ vst1q_f32(C + (i) * ldc + 4, acc##i##_1); \
+ }
+ ROWS_APPLY(STORE_ROW)
+#undef STORE_ROW
+
+#undef ROWS_APPLY
+#undef IF_M
+}
+
+template <int32_t N, typename kv_cache_t>
+FORCE_INLINE void gemm_macro_neon_fmla_Mx8_Ku4(const float* __restrict A,
+ const kv_cache_t* __restrict B,
+ float* __restrict C, int32_t M,
+ int32_t K, int64_t lda,
+ int64_t ldb, int64_t ldc,
+ bool accumulate) {
+ // micro kernel is Mx8
+ static_assert(N % 8 == 0, "N must be a multiple of 8");
+ for (int32_t m = 0; m < M;) {
+ int32_t mb = (M - m >= 8) ? 8 : (M - m >= 4) ? 4 : (M - m >= 2) ? 2 : 1;
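+    // e.g. M == 11 is covered as row tiles of 8, then 2, then 1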
+ const float* Ab = A + m * lda;
+ float* Cb = C + m * ldc;
+
+ for (int32_t n = 0; n < N; n += 8) {
+ const kv_cache_t* Bn = B + n;
+ float* Cn = Cb + n;
+ switch (mb) {
+ case 8:
+ gemm_micro_neon_fmla_Mx8_Ku4<8, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+ K, accumulate);
+ break;
+ case 4:
+ gemm_micro_neon_fmla_Mx8_Ku4<4, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+ K, accumulate);
+ break;
+ case 2:
+ gemm_micro_neon_fmla_Mx8_Ku4<2, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+ K, accumulate);
+ break;
+ default:
+ gemm_micro_neon_fmla_Mx8_Ku4<1, kv_cache_t>(Ab, Bn, Cn, lda, ldb, ldc,
+ K, accumulate);
+ break;
+ }
+ }
+ // no tail loop for N as it's guaranteed to be a multiple of 8
+ m += mb;
+ }
+}
+
+template
+class TileGemmNeonFMLA {
+ public:
+ template
+ FORCE_INLINE static void gemm(const int32_t m_size,
+ float* __restrict__ a_tile,
+ kv_cache_t* __restrict__ b_tile,
+ float* __restrict__ c_tile, const int64_t lda,
+ const int64_t ldb, const int64_t ldc,
+ const int32_t block_size,
+ const int32_t dynamic_k_size,
+ const bool accum_c) {
+ if constexpr (phase == AttentionGemmPhase::QK) {
+ gemm_macro_neon_fmla_Mx8_Ku4(
+ a_tile, b_tile, c_tile, m_size, k_size, lda, ldb, ldc, accum_c);
+ } else {
+ gemm_macro_neon_fmla_Mx8_Ku4(
+ a_tile, b_tile, c_tile, m_size, dynamic_k_size, lda, ldb, ldc,
+ accum_c);
+ }
+ }
+};
+
+} // namespace
+
+// this is similar to "ISA::VEC" at the moment
+template
+class AttentionImpl {
+ public:
+ using query_t = scalar_t;
+ using q_buffer_t = float;
+ using kv_cache_t = scalar_t;
+ using logits_buffer_t = float;
+ using partial_output_buffer_t = float;
+ using prob_buffer_t = float;
+
+ constexpr static int64_t BlockSizeAlignment =
+ BLOCK_SIZE_ALIGNMENT; // KV token num unit of QK and PV phases
+ constexpr static int64_t HeadDimAlignment =
+ HEAD_SIZE_ALIGNMENT; // headdim num unit of PV phase
+ constexpr static int64_t MaxQHeadNumPerIteration = MAX_Q_HEAD_NUM_PER_ITER;
+ constexpr static int64_t HeadDim = head_dim;
+ constexpr static ISA ISAType = ISA::NEON;
+ constexpr static bool scale_on_logits = false; // apply scale on q_buffer
+
+ static_assert(HeadDim % HeadDimAlignment == 0);
+ // the gemm micro kernel is Mx8
+ static_assert(HeadDimAlignment % 8 == 0);
+ static_assert(BlockSizeAlignment % 8 == 0);
+
+ public:
+ template typename attention>
+ FORCE_INLINE void execute_attention(DEFINE_CPU_ATTENTION_PARAMS) {
+ attention> attention_iteration;
+ attention_iteration(CPU_ATTENTION_PARAMS);
+ }
+
+ // k_cache_token_group_stride: stride of K cache when move to next
+ // BlockSizeAlignment tokens in a block
+ constexpr static int64_t k_cache_token_group_stride(
+ const int32_t block_size) {
+ return BlockSizeAlignment; // layout of k_cache block is [head_dim,
+ // block_size], row-major
+ }
+
+ // v_cache_token_group_stride: stride of V cache when move to next
+ // BlockSizeAlignment tokens in a block
+ constexpr static int64_t v_cache_token_group_stride(
+ const int32_t block_size) {
+ return head_dim * BlockSizeAlignment; // layout of v_cache is [block_size,
+ // head_dim], row-major
+ }
+
+ // v_cache_head_group_stride: stride of V cache when move to next
+ // HeadDimAlignment head dims in a block
+ constexpr static int64_t v_cache_head_group_stride(const int32_t block_size) {
+ return HeadDimAlignment; // layout of v_cache is [block_size, head_dim],
+ // row-major
+ }
+
+ // Copy q to q_buffer and cast it to fp32
+ static void copy_q_heads_tile(
+ scalar_t* __restrict__ src, // [q_num, q_heads_per_kv, head_size]
+ float* __restrict__ q_buffer, const int32_t q_num,
+ const int32_t q_heads_per_kv, const int64_t q_num_stride,
+ const int64_t q_head_stride, float scale) {
+ static_assert(head_dim % 16 == 0);
+ constexpr int32_t unroll_size = head_dim / 16;
+    using load_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
+
+ vec_op::FP32Vec16 scale_vec(scale);
+ for (int32_t q_num_idx = 0; q_num_idx < q_num; ++q_num_idx) {
+ for (int32_t q_head_idx = 0; q_head_idx < q_heads_per_kv; ++q_head_idx) {
+ scalar_t* __restrict__ curr_q =
+ src + q_num_idx * q_num_stride + q_head_idx * q_head_stride;
+ float* __restrict__ curr_q_buffer =
+ q_buffer + q_num_idx * q_heads_per_kv * head_dim +
+ q_head_idx * head_dim;
+
+        vec_op::unroll_loop<int32_t, unroll_size>([&](int32_t i) {
+ load_vec_t vec(curr_q);
+ vec_op::FP32Vec16 fp32_vec(vec);
+ fp32_vec = fp32_vec * scale_vec;
+ fp32_vec.save(curr_q_buffer);
+
+ curr_q += 16;
+ curr_q_buffer += 16;
+ });
+ }
+ }
+ }
+
+ // reshape K as column-major and V as row-major
+ static void reshape_and_cache(
+ const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
+ scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
+ const int64_t* __restrict__ slot_mapping, const int64_t token_num,
+ const int64_t key_token_num_stride, const int64_t value_token_num_stride,
+ const int64_t head_num, const int64_t key_head_num_stride,
+ const int64_t value_head_num_stride, const int64_t num_blocks,
+ const int64_t num_blocks_stride, const int64_t cache_head_num_stride,
+ const int64_t block_size, const int64_t block_size_stride) {
+#pragma omp parallel for collapse(2)
+ for (int64_t token_idx = 0; token_idx < token_num; ++token_idx) {
+ for (int64_t head_idx = 0; head_idx < head_num; ++head_idx) {
+ const int64_t pos = slot_mapping[token_idx];
+ if (pos < 0) {
+ // skip
+ continue;
+ }
+
+ const int64_t block_idx = pos / block_size;
+ const int64_t block_offset = pos % block_size;
+ {
+ // Write Key
+ const scalar_t* key_start_ptr = key +
+ token_idx * key_token_num_stride +
+ head_idx * key_head_num_stride;
+ scalar_t* key_cache_start_ptr =
+ key_cache + block_idx * num_blocks_stride +
+ head_idx * cache_head_num_stride + block_offset;
+
+#pragma GCC unroll 8
+ for (int64_t i = 0, j = 0; i < head_dim; ++i, j += block_size) {
+ key_cache_start_ptr[j] = key_start_ptr[i];
+ }
+ }
+ {
+ // Write Value
+ const scalar_t* value_start_ptr = value +
+ token_idx * value_token_num_stride +
+ head_idx * value_head_num_stride;
+ scalar_t* value_cache_start_ptr =
+ value_cache + block_idx * num_blocks_stride +
+ head_idx * cache_head_num_stride + block_offset * head_dim;
+ std::memcpy(value_cache_start_ptr, value_start_ptr,
+ sizeof(scalar_t) * head_dim);
+ }
+ }
+ }
+ }
+};
+} // namespace cpu_attention
+
+#endif // #ifndef CPU_ATTN_NEON_HPP
diff --git a/csrc/cpu/cpu_types_scalar.hpp b/csrc/cpu/cpu_types_scalar.hpp
index 1a9278bc662e..f9da78283da5 100644
--- a/csrc/cpu/cpu_types_scalar.hpp
+++ b/csrc/cpu/cpu_types_scalar.hpp
@@ -26,10 +26,6 @@ namespace vec_op {
#define FORCE_INLINE __attribute__((always_inline)) inline
-#define __max(a, b) ((a) > (b) ? (a) : (b))
-#define __min(a, b) ((a) < (b) ? (a) : (b))
-#define __abs(a) ((a) < (0) ? (0 - a) : (a))
-
typedef struct f16x8_t {
uint16_t val[8];
} f16x8_t;
@@ -99,7 +95,7 @@ struct FP16Vec16 : public Vec {
void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }
void save(void* ptr, const int elem_num) const {
- int num = __min(elem_num, VEC_ELEM_NUM);
+ int num = std::min(elem_num, VEC_ELEM_NUM);
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
}
};
@@ -128,7 +124,7 @@ struct BF16Vec16 : public Vec {
void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }
void save(void* ptr, const int elem_num) const {
- int num = __min(elem_num, VEC_ELEM_NUM);
+ int num = std::min(elem_num, VEC_ELEM_NUM);
std::memcpy(ptr, &(reg.val[0]), num * sizeof(uint16_t));
}
};
@@ -143,9 +139,9 @@ struct BF16Vec32 : public Vec {
explicit BF16Vec32(f16x32_t data) : reg(data) {};
explicit BF16Vec32(BF16Vec8& vec8_data) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ unroll_loop([&vec8_data, this](int i) {
reg.val[i] = vec8_data.reg.val[i % BF16Vec8::VEC_ELEM_NUM];
- }
+ });
}
void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }
@@ -157,15 +153,11 @@ struct FP32Vec4 : public Vec {
f32x4_t reg;
explicit FP32Vec4(float v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = v;
- }
+ unroll_loop([&v, this](int i) { reg.val[i] = v; });
}
explicit FP32Vec4() {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = 0.0f;
- }
+ unroll_loop([this](int i) { reg.val[i] = 0.0f; });
}
explicit FP32Vec4(const float* ptr)
@@ -182,15 +174,11 @@ struct FP32Vec8 : public Vec {
f32x8_t reg;
explicit FP32Vec8(float v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = v;
- }
+ unroll_loop([&v, this](int i) { reg.val[i] = v; });
}
explicit FP32Vec8() {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = 0.0f;
- }
+ unroll_loop([this](int i) { reg.val[i] = 0.0f; });
}
explicit FP32Vec8(const float* ptr)
@@ -201,78 +189,68 @@ struct FP32Vec8 : public Vec {
explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {};
explicit FP32Vec8(const FP16Vec8& v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = fp16_to_float(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); });
}
FP32Vec8(const BF16Vec8& v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = bf16_to_float(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); });
}
float reduce_sum() const {
float result = 0;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result += reg.val[i];
- }
+ unroll_loop(
+ [&result, this](int i) { result += reg.val[i]; });
return result;
}
FP32Vec8 exp() const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = expf(reg.val[i]);
- }
+ unroll_loop(
+ [&ret, this](int i) { ret.val[i] = expf(reg.val[i]); });
return FP32Vec8(ret);
}
FP32Vec8 tanh() const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = tanhf(reg.val[i]);
- }
+ unroll_loop(
+ [&ret, this](int i) { ret.val[i] = tanhf(reg.val[i]); });
return FP32Vec8(ret);
}
FP32Vec8 er() const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = erf(reg.val[i]);
- }
+ unroll_loop(
+ [&ret, this](int i) { ret.val[i] = erf(reg.val[i]); });
return FP32Vec8(ret);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = reg.val[i] * b.reg.val[i];
- }
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; });
return FP32Vec8(ret);
}
FP32Vec8 operator+(const FP32Vec8& b) const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = reg.val[i] + b.reg.val[i];
- }
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; });
return FP32Vec8(ret);
}
FP32Vec8 operator-(const FP32Vec8& b) const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = reg.val[i] - b.reg.val[i];
- }
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; });
return FP32Vec8(ret);
}
FP32Vec8 operator/(const FP32Vec8& b) const {
f32x8_t ret;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- ret.val[i] = reg.val[i] / b.reg.val[i];
- }
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; });
return FP32Vec8(ret);
}
@@ -284,15 +262,11 @@ struct FP32Vec16 : public Vec {
f32x16_t reg;
explicit FP32Vec16(float v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = v;
- }
+ unroll_loop([&v, this](int i) { reg.val[i] = v; });
}
explicit FP32Vec16() {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = 0.0f;
- }
+ unroll_loop([this](int i) { reg.val[i] = 0.0f; });
}
explicit FP32Vec16(const float* ptr)
@@ -301,29 +275,27 @@ struct FP32Vec16 : public Vec {
explicit FP32Vec16(f32x16_t data) : reg(data) {};
FP32Vec16(const FP32Vec4& data) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ unroll_loop([&data, this](int i) {
reg.val[i] = data.reg.val[i % FP32Vec4::VEC_ELEM_NUM];
- }
+ });
}
FP32Vec16(const FP32Vec8& data) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ unroll_loop([&data, this](int i) {
reg.val[i] = data.reg.val[i % FP32Vec8::VEC_ELEM_NUM];
- }
+ });
}
FP32Vec16(const FP32Vec16& data) : reg(data.reg) {};
explicit FP32Vec16(const FP16Vec16& v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = fp16_to_float(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = fp16_to_float(v.reg.val[i]); });
}
explicit FP32Vec16(const BF16Vec16& v) {
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- reg.val[i] = bf16_to_float(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = bf16_to_float(v.reg.val[i]); });
}
explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
@@ -331,82 +303,74 @@ struct FP32Vec16 : public Vec {
FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {};
FP32Vec16 operator*(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = reg.val[i] * b.reg.val[i];
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] * b.reg.val[i]; });
+ return FP32Vec16(ret);
}
FP32Vec16 operator+(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = reg.val[i] + b.reg.val[i];
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] + b.reg.val[i]; });
+ return FP32Vec16(ret);
}
FP32Vec16 operator-(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = reg.val[i] - b.reg.val[i];
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] - b.reg.val[i]; });
+ return FP32Vec16(ret);
}
FP32Vec16 operator/(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = reg.val[i] / b.reg.val[i];
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, &b, this](int i) { ret.val[i] = reg.val[i] / b.reg.val[i]; });
+ return FP32Vec16(ret);
}
FP32Vec16 max(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = __max(reg.val[i], b.reg.val[i]);
- }
- return result;
+ f32x16_t ret;
+ unroll_loop([&ret, &b, this](int i) {
+ ret.val[i] = std::max(reg.val[i], b.reg.val[i]);
+ });
+ return FP32Vec16(ret);
}
FP32Vec16 min(const FP32Vec16& b) const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = __min(reg.val[i], b.reg.val[i]);
- }
- return result;
+ f32x16_t ret;
+ unroll_loop([&ret, &b, this](int i) {
+ ret.val[i] = std::min(reg.val[i], b.reg.val[i]);
+ });
+ return FP32Vec16(ret);
}
FP32Vec16 abs() const {
- FP32Vec16 result(0.0f);
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result.reg.val[i] = __abs(reg.val[i]);
- }
- return result;
+ f32x16_t ret;
+ unroll_loop(
+ [&ret, this](int i) { ret.val[i] = std::abs(reg.val[i]); });
+ return FP32Vec16(ret);
}
float reduce_sum() const {
float result = 0.0f;
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result += reg.val[i];
- }
+ unroll_loop(
+ [&result, this](int i) { result += reg.val[i]; });
return result;
}
float reduce_max() const {
- float result = reg.val[0];
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result = __max(reg.val[i], result);
- }
+ float result = std::numeric_limits<float>::lowest();
+ unroll_loop(
+ [&result, this](int i) { result = std::max(reg.val[i], result); });
return result;
}
float reduce_min() const {
- float result = reg.val[0];
- for (int i = 0; i < VEC_ELEM_NUM; ++i) {
- result = __min(reg.val[i], result);
- }
+ float result = std::numeric_limits<float>::max();
+ unroll_loop(
+ [&result, this](int i) { result = std::min(reg.val[i], result); });
return result;
}
@@ -414,13 +378,9 @@ struct FP32Vec16 : public Vec {
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
float sum = 0.0;
- int start = idx * group_size;
- int end = (idx + 1) * group_size;
-
- for (; (start < VEC_ELEM_NUM) && (start < end); ++start) {
- sum += reg.val[start];
- }
-
+ const int start = idx * group_size;
+ unroll_loop(
+ [&sum, &start, this](int i) { sum += reg.val[start + i]; });
return sum;
}
@@ -477,17 +437,13 @@ inline void storeFP32(float v, c10::BFloat16* ptr) {
}
inline FP16Vec16::FP16Vec16(const FP32Vec16& v) {
- int i = 0;
- for (i = 0; i < FP16Vec16::VEC_ELEM_NUM; ++i) {
- reg.val[i] = float_to_fp16(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); });
}
inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) {
- int i = 0;
- for (i = 0; i < FP16Vec8::VEC_ELEM_NUM; ++i) {
- reg.val[i] = float_to_fp16(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = float_to_fp16(v.reg.val[i]); });
}
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
@@ -495,17 +451,13 @@ inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
}
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
- int i = 0;
- for (i = 0; i < BF16Vec8::VEC_ELEM_NUM; ++i) {
- reg.val[i] = float_to_bf16(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); });
}
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
- int i = 0;
- for (i = 0; i < BF16Vec16::VEC_ELEM_NUM; ++i) {
- reg.val[i] = float_to_bf16(v.reg.val[i]);
- }
+ unroll_loop(
+ [&v, this](int i) { reg.val[i] = float_to_bf16(v.reg.val[i]); });
}
inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 3); }
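
The scalar header now funnels every elementwise loop through `vec_op::unroll_loop`, whose definition lives elsewhere in the CPU headers and is not part of this diff. As a rough mental model (the real signature may differ), a compile-time-unrolled helper can be sketched as:

```cpp
#include <utility>

// Illustrative sketch only; the actual vec_op::unroll_loop is defined in the
// vLLM CPU headers and its exact signature is not shown in this diff.
template <int N, typename F, int... Is>
constexpr void unroll_loop_impl(F&& f, std::integer_sequence<int, Is...>) {
  (f(Is), ...);  // expand f(0), f(1), ..., f(N-1) at compile time
}

template <int N, typename F>
constexpr void unroll_loop(F&& f) {
  unroll_loop_impl<N>(std::forward<F>(f), std::make_integer_sequence<int, N>{});
}

struct f32x16_t {
  float val[16];
};

// Usage mirroring the rewritten FP32Vec16 default constructor above.
inline f32x16_t zero_vec() {
  f32x16_t reg;
  unroll_loop<16>([&](int i) { reg.val[i] = 0.0f; });
  return reg;
}
```
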
diff --git a/csrc/cpu/cpu_types_vxe.hpp b/csrc/cpu/cpu_types_vxe.hpp
index 51bca37e699b..9efd8b7ec14a 100644
--- a/csrc/cpu/cpu_types_vxe.hpp
+++ b/csrc/cpu/cpu_types_vxe.hpp
@@ -4,6 +4,7 @@
#include
#include
+#include
#include
namespace vec_op {
@@ -174,8 +175,9 @@ struct FP32Vec8 : public Vec {
}
explicit FP32Vec8(const BF16Vec8& v) {
- reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
- reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
+ // On big-endian s390x, place BF16 first to get correct byte order
+ reg.val[0] = (__vector float)vec_mergeh(v.reg, zero);
+ reg.val[1] = (__vector float)vec_mergel(v.reg, zero);
}
float reduce_sum() const {
@@ -189,51 +191,257 @@ struct FP32Vec8 : public Vec {
}
FP32Vec8 exp() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::exp(ar.values[0]);
- ret.val[0][1] = std::exp(ar.values[1]);
- ret.val[0][2] = std::exp(ar.values[2]);
- ret.val[0][3] = std::exp(ar.values[3]);
- ret.val[1][0] = std::exp(ar.values[4]);
- ret.val[1][1] = std::exp(ar.values[5]);
- ret.val[1][2] = std::exp(ar.values[6]);
- ret.val[1][3] = std::exp(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ f32x4x2_t out;
+
+ const __vector float log2e = vec_splats(1.44269504088896341f);
+ const __vector float one = vec_splats(1.0f);
+ const __vector float min_x = vec_splats(-87.3f);
+ const __vector float max_x = vec_splats(88.7f);
+
+ // 5th-degree minimax polynomial for 2^r (r in [0,1))
+ const __vector float c1 = vec_splats(0.6931471805599453f);
+ const __vector float c2 = vec_splats(0.240226506959101f);
+ const __vector float c3 = vec_splats(0.05550410866482158f);
+ const __vector float c4 = vec_splats(0.009618129107628477f);
+ const __vector float c5 = vec_splats(0.0013333558146428443f);
+
+ for (int i = 0; i < 2; i++) {
+ __vector float x = reg.val[i];
+
+ x = vec_max(x, min_x);
+ x = vec_min(x, max_x);
+
+ __vector float y = vec_mul(x, log2e);
+
+ __vector float kf = vec_floor(y);
+ __vector float r = vec_sub(y, kf);
+
+ __vector signed int k = vec_signed(kf);
+ const __vector signed int min_k = vec_splats((signed int)-126);
+ const __vector signed int max_k = vec_splats((signed int)127);
+ k = vec_min(vec_max(k, min_k), max_k);
+
+ // Build 2^k from exponent bits
+ __vector signed int exp_int = vec_add(k, vec_splats((signed int)127));
+ __vector unsigned int bits = (__vector unsigned int)exp_int;
+ bits = vec_sl(bits, vec_splats((unsigned int)23));
+ __vector float pow2k = (__vector float)bits;
+
+ // Improved minimax polynomial
+ __vector float poly = vec_madd(c5, r, c4);
+ poly = vec_madd(poly, r, c3);
+ poly = vec_madd(poly, r, c2);
+ poly = vec_madd(poly, r, c1);
+ poly = vec_madd(poly, r, one);
+
+ out.val[i] = vec_mul(pow2k, poly);
+ }
+
+ return FP32Vec8(out);
}
FP32Vec8 tanh() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::tanh(ar.values[0]);
- ret.val[0][1] = std::tanh(ar.values[1]);
- ret.val[0][2] = std::tanh(ar.values[2]);
- ret.val[0][3] = std::tanh(ar.values[3]);
- ret.val[1][0] = std::tanh(ar.values[4]);
- ret.val[1][1] = std::tanh(ar.values[5]);
- ret.val[1][2] = std::tanh(ar.values[6]);
- ret.val[1][3] = std::tanh(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
+ const __vector float one = vec_splats(1.0f);
+ const __vector float two = vec_splats(2.0f);
+ const __vector float zero = vec_splats(0.0f);
+ const __vector float sat =
+ vec_splats(9.0f); // beyond this, tanh(x) ~ sign(x)
+
+ f32x4x2_t out;
+
+ for (int i = 0; i < 2; i++) {
+ __vector float x = reg.val[i];
+ __vector float ax = vec_abs(x);
+
+ // sign(x): +1 or -1
+ __vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
+
+ // saturation mask: |x| > sat
+ __vector __bool int saturated = vec_cmpgt(ax, sat);
+
+ // 2x
+ __vector float two_x = vec_mul(x, two);
+
+ // Build a temporary FP32Vec8 with both lanes = 2x, reuse exp()
+ f32x4x2_t tmp;
+ tmp.val[0] = two_x;
+ tmp.val[1] = two_x;
+ FP32Vec8 exp_2x_vec(tmp);
+
+ FP32Vec8 e2x = exp_2x_vec.exp();
+ __vector float e = e2x.reg.val[i];
+
+ // tanh(x) = (e - 1) / (e + 1)
+ __vector float num = vec_sub(e, one);
+ __vector float den = vec_add(e, one);
+
+ __vector float t = vec_div(num, den);
+
+ // For large |x|, clamp to sign(x)
+ out.val[i] = vec_sel(t, sign, saturated);
+ }
+
+ return FP32Vec8(out);
}
FP32Vec8 er() const {
- // TODO: Vectorize this
- AliasReg ar;
- ar.reg = reg;
- f32x4x4_t ret;
- ret.val[0][0] = std::erf(ar.values[0]);
- ret.val[0][1] = std::erf(ar.values[1]);
- ret.val[0][2] = std::erf(ar.values[2]);
- ret.val[0][3] = std::erf(ar.values[3]);
- ret.val[1][0] = std::erf(ar.values[4]);
- ret.val[1][1] = std::erf(ar.values[5]);
- ret.val[1][2] = std::erf(ar.values[6]);
- ret.val[1][3] = std::erf(ar.values[7]);
- return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
+ // A&S 7.1.26 approximation:
+ // erf(x) = sign(x) * (1 - ((((a5*t + a4)*t + a3)*t + a2)*t + a1) * t * exp(-x^2))
+ // with t = 1 / (1 + p*|x|), p = 0.3275911
+
+ const __vector float one = vec_splats(1.0f);
+ const __vector float zero = vec_splats(0.0f);
+ const __vector float p = vec_splats(0.3275911f);
+
+ // Polynomial coeffs
+ const __vector float a1 = vec_splats(0.254829592f);
+ const __vector float a2 = vec_splats(-0.284496736f);
+ const __vector float a3 = vec_splats(1.421413741f);
+ const __vector float a4 = vec_splats(-1.453152027f);
+ const __vector float a5 = vec_splats(1.061405429f);
+
+ // Threshold where erf(x) ~ sign(x)
+ const __vector float sat = vec_splats(6.0f);
+
+ f32x4x2_t out;
+
+ for (int lane = 0; lane < 2; lane++) {
+ __vector float x = reg.val[lane];
+ __vector float ax = vec_abs(x);
+
+ // sign(x)
+ __vector float sign = vec_sel(vec_splats(-1.0f), one, vec_cmpgt(x, zero));
+
+ // |x| > 6 → erf(x) = ±1
+ __vector __bool int saturated = vec_cmpgt(ax, sat);
+
+ // t = 1 / (1 + p * |x|)
+ __vector float t = vec_madd(p, ax, one);
+ t = vec_div(one, t);
+
+ // poly = a5
+ __vector float poly = a5;
+ poly = vec_madd(poly, t, a4);
+ poly = vec_madd(poly, t, a3);
+ poly = vec_madd(poly, t, a2);
+ poly = vec_madd(poly, t, a1);
+
+ // full polynomial: poly = poly * t
+ poly = vec_mul(poly, t);
+
+ // Compute exp(-x^2)
+ __vector float x2 = vec_mul(x, x);
+ __vector float neg_x2 = vec_neg(x2);
+
+ f32x4x2_t tmp;
+ tmp.val[0] = neg_x2;
+ tmp.val[1] = neg_x2;
+ FP32Vec8 exp_neg_x2(tmp);
+
+ FP32Vec8 e = exp_neg_x2.exp();
+ __vector float ex = e.reg.val[lane];
+
+ // erf(x) = sign * (1 - poly * exp(-x^2))
+ __vector float term = vec_mul(poly, ex);
+ __vector float y = vec_sub(one, term);
+ y = vec_mul(y, sign);
+
+ // saturated → ±1
+ __vector float sat_val = vec_mul(sign, one);
+ out.val[lane] = vec_sel(y, sat_val, saturated);
+ }
+
+ return FP32Vec8(out);
+ }
+ // Elementwise sigmoid(x) = 1 / (1 + exp(-x))
+ FP32Vec8 sigmoid() const {
+ const __vector float one = vec_splats(1.0f);
+
+ f32x4x2_t neg;
+ for (int i = 0; i < 2; ++i) {
+ neg.val[i] = vec_neg(reg.val[i]);
+ }
+
+ FP32Vec8 neg_x(neg);
+ FP32Vec8 e = neg_x.exp(); // exp(-x)
+
+ f32x4x2_t denom;
+ for (int i = 0; i < 2; ++i) {
+ denom.val[i] = vec_add(one, e.reg.val[i]);
+ }
+
+ FP32Vec8 denom_vec(denom);
+ FP32Vec8 one_vec(1.0f);
+
+ return one_vec / denom_vec;
+ }
+
+ // Tanh-based GELU:
+ // gelu(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x^3)))
+ FP32Vec8 gelu_tanh() const {
+ const __vector float k_s2pi = vec_splats(0.7978845608028654f); // √(2/π)
+ const __vector float k_0_0447 = vec_splats(0.044715f);
+
+ f32x4x2_t x2, x3, inner;
+ for (int i = 0; i < 2; ++i) {
+ __vector float x = reg.val[i];
+ x2.val[i] = vec_mul(x, x); // x^2
+ x3.val[i] = vec_mul(x2.val[i], x); // x^3
+ __vector float t = vec_madd(k_0_0447, x3.val[i], x); // x + 0.044715*x^3
+ inner.val[i] = vec_mul(k_s2pi, t); // √(2/π)*(...)
+ }
+
+ FP32Vec8 inner_vec(inner);
+ FP32Vec8 t = inner_vec.tanh(); // tanh part
+
+ FP32Vec8 one_vec(1.0f);
+ FP32Vec8 half_vec(0.5f);
+
+ FP32Vec8 x_vec(*this);
+ return x_vec * half_vec * (one_vec + t);
+ }
+
+ // Erf-based GELU:
+ // gelu(x) = 0.5 * x * (1 + erf(x / √2))
+ FP32Vec8 gelu_erf() const {
+ const __vector float inv_sqrt2 = vec_splats(0.7071067811865476f); // 1/√2
+ FP32Vec8 x_vec(*this);
+
+ f32x4x2_t scaled;
+ for (int i = 0; i < 2; ++i) {
+ scaled.val[i] = vec_mul(reg.val[i], inv_sqrt2);
+ }
+ FP32Vec8 x_scaled(scaled);
+
+ FP32Vec8 erf_x = x_scaled.er();
+
+ FP32Vec8 one_vec(1.0f);
+ FP32Vec8 half_vec(0.5f);
+
+ return x_vec * half_vec * (one_vec + erf_x);
+ }
+
+ // Elementwise reciprocal: 1/x (scalar per lane, for correctness)
+ FP32Vec8 rcp() const {
+ AliasReg in, out;
+ in.reg = reg;
+
+ for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ out.values[i] = 1.0f / in.values[i];
+ }
+ return FP32Vec8(out.reg);
+ }
+
+ // Elementwise rsqrt(x) = 1 / sqrt(x) (scalar per lane, for correctness)
+ FP32Vec8 rsqrt() const {
+ AliasReg in, out;
+ in.reg = reg;
+
+ for (int i = 0; i < VEC_ELEM_NUM; ++i) {
+ out.values[i] = 1.0f / std::sqrt(in.values[i]);
+ }
+ return FP32Vec8(out.reg);
}
FP32Vec8 operator*(const FP32Vec8& b) const {
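
For reference, the vectorized `exp()` above uses the usual range-reduction recipe: clamp x, write x * log2(e) = k + r with k integer and r in [0, 1), rebuild 2^k directly from IEEE-754 exponent bits, and evaluate a degree-5 polynomial for 2^r. A scalar sketch of the same scheme (illustrative only, not part of the patch):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar model of the vectorized exp() above: exp(x) = 2^k * 2^r with
// k = floor(x * log2(e)) and r = x * log2(e) - k in [0, 1).
inline float exp_approx(float x) {
  x = std::min(std::max(x, -87.3f), 88.7f);  // clamp to avoid overflow/underflow
  const float y = x * 1.44269504088896341f;  // x * log2(e)
  const float kf = std::floor(y);
  const float r = y - kf;

  int k = static_cast<int>(kf);
  k = std::min(std::max(k, -126), 127);

  uint32_t bits = static_cast<uint32_t>(k + 127) << 23;  // 2^k from exponent bits
  float pow2k;
  std::memcpy(&pow2k, &bits, sizeof(pow2k));

  // Degree-5 polynomial approximating 2^r on [0, 1)
  float poly = 0.0013333558146428443f;
  poly = poly * r + 0.009618129107628477f;
  poly = poly * r + 0.05550410866482158f;
  poly = poly * r + 0.240226506959101f;
  poly = poly * r + 0.6931471805599453f;
  poly = poly * r + 1.0f;

  return pow2k * poly;
}
```
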
@@ -316,10 +524,11 @@ struct FP32Vec16 : public Vec {
}
explicit FP32Vec16(const BF16Vec16& v) {
- reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
- reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
- reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
- reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
+ // On big-endian s390x, place BF16 first to get correct byte order
+ reg.val[0] = (__vector float)vec_mergeh(v.reg.val[0], zero);
+ reg.val[1] = (__vector float)vec_mergel(v.reg.val[0], zero);
+ reg.val[2] = (__vector float)vec_mergeh(v.reg.val[1], zero);
+ reg.val[3] = (__vector float)vec_mergel(v.reg.val[1], zero);
}
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
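
The merge-order fix above matters because bfloat16 is simply the upper 16 bits of an IEEE-754 float32; on big-endian s390x the BF16 halfword therefore has to land in the most significant half of each 32-bit lane. A scalar equivalent of the widening:

```cpp
#include <cstdint>
#include <cstring>

// Scalar equivalent of the BF16 -> FP32 widening fixed above: bfloat16 is the
// upper 16 bits of a float32, so widening shifts it into the high half and
// zero-fills the low half.
inline float bf16_to_float_ref(uint16_t bf16_bits) {
  const uint32_t bits = static_cast<uint32_t>(bf16_bits) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
```
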
@@ -376,6 +585,23 @@ struct FP32Vec16 : public Vec {
return result;
}
+ FP32Vec16 max(const FP32Vec16& b) const {
+ return FP32Vec16(f32x4x4_t({vec_max(reg.val[0], b.reg.val[0]),
+ vec_max(reg.val[1], b.reg.val[1]),
+ vec_max(reg.val[2], b.reg.val[2]),
+ vec_max(reg.val[3], b.reg.val[3])}));
+ }
+
+ float reduce_max() const {
+ AliasReg ar;
+ ar.reg = reg;
+ float result = ar.values[0];
+ unroll_loop([&result, &ar](int i) {
+ if (ar.values[i] > result) result = ar.values[i];
+ });
+ return result;
+ }
+
void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr);
@@ -402,15 +628,14 @@ struct VecType {
using vec_type = BF16Vec8;
};
+// On s390x, FP16 (Half) is not natively supported; use FP32 vectors instead
+using FP16Vec16 = FP32Vec16;
+
template
void storeFP32(float v, T* ptr) {
*ptr = v;
}
-inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
- acc = acc + a * b;
-}
-
namespace c10 {
struct BFloat16 {
uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit
@@ -429,6 +654,79 @@ inline void storeFP32(float v, c10::BFloat16* ptr) {
#define __VEC_CLASS_FP_NAN (1 << 6)
#endif
+// Optimized FMA (Fused Multiply-Add) implementations using IBM Z vector
+// intrinsics
+
+// FP32Vec4 FMA: acc = acc + (a * b) or equivalently acc = fma(a, b, acc)
+FORCE_INLINE void fma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_madd(a.reg, b.reg, acc.reg);
+}
+
+// FP32Vec8 FMA: acc = acc + (a * b)
+FORCE_INLINE void fma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+// FP32Vec16 FMA: acc = acc + (a * b)
+FORCE_INLINE void fma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_madd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_madd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_madd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_madd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Multiply-Subtract: acc = acc - (a * b)
+FORCE_INLINE void fms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_msub(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void fms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void fms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_msub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_msub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_msub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_msub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Negative Multiply-Add: acc = -(a * b) + acc
+FORCE_INLINE void nfma(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_nmadd(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void nfma(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void nfma(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_nmadd(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmadd(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_nmadd(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_nmadd(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
+// Negative Multiply-Subtract: acc = -(a * b) - acc
+FORCE_INLINE void nfms(FP32Vec4& acc, const FP32Vec4& a, const FP32Vec4& b) {
+ acc.reg = vec_nmsub(a.reg, b.reg, acc.reg);
+}
+
+FORCE_INLINE void nfms(FP32Vec8& acc, const FP32Vec8& a, const FP32Vec8& b) {
+ acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+}
+
+FORCE_INLINE void nfms(FP32Vec16& acc, const FP32Vec16& a, const FP32Vec16& b) {
+ acc.reg.val[0] = vec_nmsub(a.reg.val[0], b.reg.val[0], acc.reg.val[0]);
+ acc.reg.val[1] = vec_nmsub(a.reg.val[1], b.reg.val[1], acc.reg.val[1]);
+ acc.reg.val[2] = vec_nmsub(a.reg.val[2], b.reg.val[2], acc.reg.val[2]);
+ acc.reg.val[3] = vec_nmsub(a.reg.val[3], b.reg.val[3], acc.reg.val[3]);
+}
+
const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
@@ -441,13 +739,24 @@ const static __vector unsigned int one = {1, 1, 1, 1};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
+ __vector unsigned int lsb0 = inp0 >> sh16;
+ __vector unsigned int lsb1 = inp1 >> sh16;
+ lsb0 = lsb0 & one;
+ lsb1 = lsb1 & one;
+ __vector unsigned int rnd0 = lsb0 + bias;
+ __vector unsigned int rnd1 = lsb1 + bias;
+ inp0 = inp0 + rnd0;
+ inp1 = inp1 + rnd1;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel1 =
vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
- inp0 = vec_sel(inp0, nan, sel0) >> sh16;
- inp1 = vec_sel(inp1, nan, sel1) >> sh16;
+ inp0 = vec_sel(inp0, nan, sel0);
+ inp1 = vec_sel(inp1, nan, sel1);
+ inp0 = inp0 >> sh16;
+ inp1 = inp1 >> sh16;
+
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
}
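
The added lines change the FP32 to BF16 conversion from plain truncation to round-to-nearest-even: add 0x7FFF plus the least significant kept mantissa bit before dropping the low 16 bits, and keep the NaN select on the original value. A scalar model (the exact quiet-NaN payload used by the header's `nan` constant is defined outside this diff; 0x7FC0 below is a stand-in):

```cpp
#include <cstdint>
#include <cstring>

// Scalar model of the FP32 -> BF16 conversion above: round-to-nearest-even by
// adding 0x7FFF plus the least significant kept mantissa bit, then truncating.
inline uint16_t float_to_bf16_rne(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  if ((bits & 0x7F800000u) == 0x7F800000u && (bits & 0x007FFFFFu) != 0u) {
    return 0x7FC0;  // quiet NaN stand-in, mirrors the vec_fp_test_data_class() path
  }
  const uint32_t lsb = (bits >> 16) & 1u;
  bits += 0x7FFFu + lsb;  // round to nearest, ties to even
  return static_cast<uint16_t>(bits >> 16);
}
```
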
@@ -456,6 +765,22 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
+ __vector unsigned int lsb0 = inp0 >> sh16;
+ __vector unsigned int lsb1 = inp1 >> sh16;
+ __vector unsigned int lsb2 = inp2 >> sh16;
+ __vector unsigned int lsb3 = inp3 >> sh16;
+ lsb0 = lsb0 & one;
+ lsb1 = lsb1 & one;
+ lsb2 = lsb2 & one;
+ lsb3 = lsb3 & one;
+ __vector unsigned int rnd0 = lsb0 + bias;
+ __vector unsigned int rnd1 = lsb1 + bias;
+ __vector unsigned int rnd2 = lsb2 + bias;
+ __vector unsigned int rnd3 = lsb3 + bias;
+ inp0 = inp0 + rnd0;
+ inp1 = inp1 + rnd1;
+ inp2 = inp2 + rnd2;
+ inp3 = inp3 + rnd3;
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
@@ -465,15 +790,164 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel3 =
vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
- inp0 = vec_sel(inp0, nan, sel0) >> sh16;
- inp1 = vec_sel(inp1, nan, sel1) >> sh16;
- inp2 = vec_sel(inp2, nan, sel2) >> sh16;
- inp3 = vec_sel(inp3, nan, sel3) >> sh16;
+ inp0 = vec_sel(inp0, nan, sel0);
+ inp1 = vec_sel(inp1, nan, sel1);
+ inp2 = vec_sel(inp2, nan, sel2);
+ inp3 = vec_sel(inp3, nan, sel3);
+ inp0 = inp0 >> sh16;
+ inp1 = inp1 >> sh16;
+ inp2 = inp2 >> sh16;
+ inp3 = inp3 >> sh16;
+
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
}
-inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
+// 1D softmax over `n` elements in `input`, writes result to `output`.
+// Uses FP32Vec8 for main body, scalar tail handling.
+// Requirement: n > 0
+FORCE_INLINE void softmax_fp32vec8(float* output, const float* input, int n) {
+ if (n <= 0) return;
+
+ // ---------- Pass 1: find max ----------
+ float max_val = -std::numeric_limits<float>::infinity();
+ int i = 0;
+
+ for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+ FP32Vec8 v(input + i);
+ FP32Vec8::AliasReg ar;
+ ar.reg = v.reg;
+ for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+ if (ar.values[j] > max_val) max_val = ar.values[j];
+ }
+ }
+ for (; i < n; ++i) {
+ if (input[i] > max_val) max_val = input[i];
+ }
+
+ // ---------- Pass 2: compute exp(x - max) and sum ----------
+ float sum = 0.0f;
+ i = 0;
+
+ for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+ float tmp[FP32Vec8::VEC_ELEM_NUM];
+ for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+ tmp[j] = input[i + j] - max_val;
+ }
+
+ FP32Vec8 v(tmp);
+ FP32Vec8 e = v.exp();
+
+ FP32Vec8::AliasReg ar;
+ ar.reg = e.reg;
+ for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+ output[i + j] = ar.values[j];
+ sum += ar.values[j];
+ }
+ }
+
+ // Tail
+ for (; i < n; ++i) {
+ float x = input[i] - max_val;
+ float ex = std::exp(x); // scalar tail
+ output[i] = ex;
+ sum += ex;
+ }
+
+ // ---------- Pass 3: normalize ----------
+ float inv_sum = 1.0f / sum;
+ i = 0;
+
+ for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+ float tmp[FP32Vec8::VEC_ELEM_NUM];
+ for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+ tmp[j] = output[i + j] * inv_sum;
+ }
+ FP32Vec8 v(tmp);
+ v.save(output + i);
+ }
+
+ for (; i < n; ++i) {
+ output[i] *= inv_sum;
+ }
+}
+
+// 1D RMSNorm kernel:
+// input: x[0..n-1]
+// weight: w[0..n-1] (gamma), may be nullptr
+// output: y[i] = x[i] * inv_rms * (weight[i] if weight != nullptr else 1)
+// eps: small epsilon for numerical stability
+FORCE_INLINE void rmsnorm_fp32vec8(float* output, const float* input,
+ const float* weight, int n, float eps) {
+ if (n <= 0) return;
+
+ // ---------- Pass 1: compute sum of squares ----------
+ float sum_sq = 0.0f;
+ int i = 0;
+
+ for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+ FP32Vec8 x_vec(input + i);
+
+ FP32Vec8 sq = x_vec * x_vec;
+
+ FP32Vec8::AliasReg ar;
+ ar.reg = sq.reg;
+ for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+ sum_sq += ar.values[j];
+ }
+ }
+
+ // Tail
+ for (; i < n; ++i) {
+ float v = input[i];
+ sum_sq += v * v;
+ }
+
+ float mean_sq = sum_sq / static_cast<float>(n);
+ float inv_rms = 1.0f / std::sqrt(mean_sq + eps);
+
+ // ---------- Pass 2: scale (and apply weight if given) ----------
+ const float inv_rms_f = inv_rms;
+ i = 0;
+
+ if (weight) {
+ // with gamma
+ for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+ FP32Vec8 x_vec(input + i);
+
+ float wtmp[FP32Vec8::VEC_ELEM_NUM];
+ for (int j = 0; j < FP32Vec8::VEC_ELEM_NUM; ++j) {
+ wtmp[j] = weight[i + j];
+ }
+ FP32Vec8 w_vec(wtmp);
+
+ FP32Vec8 scale_vec(inv_rms_f);
+ FP32Vec8 y = x_vec * scale_vec * w_vec;
+ y.save(output + i);
+ }
+
+ for (; i < n; ++i) {
+ output[i] = input[i] * inv_rms_f * weight[i];
+ }
+ } else {
+ // without gamma
+ for (; i + FP32Vec8::VEC_ELEM_NUM <= n; i += FP32Vec8::VEC_ELEM_NUM) {
+ FP32Vec8 x_vec(input + i);
+ FP32Vec8 scale_vec(inv_rms_f);
+ FP32Vec8 y = x_vec * scale_vec;
+ y.save(output + i);
+ }
+
+ for (; i < n; ++i) {
+ output[i] = input[i] * inv_rms_f;
+ }
+ }
+}
+
+// Prefetch data to cache for better memory access performance
+FORCE_INLINE void prefetch(const void* addr) {
+ __builtin_prefetch(addr, 0, 3); // 0=read, 3=high temporal locality
+}
}; // namespace vec_op
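
A minimal usage sketch of the two helpers added above; the include path and buffer sizes are illustrative, and both functions expect contiguous float buffers with n > 0:

```cpp
#include <vector>
// #include "cpu_types_vxe.hpp"  // provides vec_op::softmax_fp32vec8 / rmsnorm_fp32vec8

void run_helpers() {
  std::vector<float> logits = {1.0f, 2.0f, 3.0f, 0.5f, -1.0f};
  std::vector<float> probs(logits.size());
  vec_op::softmax_fp32vec8(probs.data(), logits.data(),
                           static_cast<int>(logits.size()));

  std::vector<float> x(4096, 0.1f), gamma(4096, 1.0f), y(4096);
  vec_op::rmsnorm_fp32vec8(y.data(), x.data(), gamma.data(),
                           static_cast<int>(x.size()), /*eps=*/1e-6f);
}
```
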
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index 7ddf028e6e13..6f51277f7844 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -104,6 +104,8 @@ struct FP16Vec16 : public Vec {
explicit FP16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
+ explicit FP16Vec16(const c10::Half v) : reg(_mm256_set1_epi16(v.x)) {}
+
explicit FP16Vec16(const FP32Vec16&);
void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
@@ -141,6 +143,8 @@ struct BF16Vec16 : public Vec {
explicit BF16Vec16(bool, void* ptr)
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
+ explicit BF16Vec16(const c10::BFloat16 v) : reg(_mm256_set1_epi16(v.x)) {}
+
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const { _mm256_storeu_si256((__m256i*)ptr, reg); }
@@ -350,6 +354,22 @@ struct FP32Vec16 : public Vec {
explicit FP32Vec16(__m512 data) : reg(data) {}
+ // unpack 4-bit values using a 16-entry LUT
+ explicit FP32Vec16(int64_t value, const FP32Vec16& lut) {
+ int64_t mask_0 = 0x0F0F0F0F0F0F0F0F;
+ int64_t mask_1 = 0xF0F0F0F0F0F0F0F0;
+ int64_t value_0 = value & mask_0;
+ int64_t value_1 = value & mask_1;
+ __m128i vec_0 = _mm_movpi64_epi64((__m64)value_0);
+ __m128i vec_1 = _mm_movpi64_epi64((__m64)value_1);
+ vec_0 = _mm_cvtepu8_epi16(vec_0);
+ vec_1 = _mm_cvtepu8_epi16(vec_1);
+ vec_1 = _mm_slli_epi16(vec_1, 4);
+ __m128i vec = _mm_or_si128(vec_0, vec_1);
+ __m512i vec_i32 = _mm512_cvtepu8_epi32(vec);
+ reg = _mm512_permutexvar_ps(vec_i32, lut.reg);
+ }
+
explicit FP32Vec16(const FP32Vec4& data)
: reg((__m512)_mm512_inserti32x4(
_mm512_inserti32x4(
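
The new `FP32Vec16(int64_t, const FP32Vec16&)` constructor above unpacks sixteen 4-bit indices from one 64-bit word and maps each through a 16-entry float lookup table via `_mm512_permutexvar_ps`. A scalar model of the same nibble ordering (low nibble of byte i first, then its high nibble):

```cpp
#include <array>
#include <cstdint>

// Scalar model of the 4-bit unpack constructor above: each index selects one
// of the 16 floats in the lookup table.
inline std::array<float, 16> depack_4bit(int64_t value,
                                         const std::array<float, 16>& lut) {
  std::array<float, 16> out{};
  const uint64_t v = static_cast<uint64_t>(value);
  for (int byte = 0; byte < 8; ++byte) {
    const uint8_t b = static_cast<uint8_t>(v >> (8 * byte));
    out[2 * byte] = lut[b & 0x0F];             // low nibble
    out[2 * byte + 1] = lut[(b >> 4) & 0x0F];  // high nibble
  }
  return out;
}
```
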
@@ -426,14 +446,6 @@ struct FP32Vec16 : public Vec {
float get_last_elem() const { return _mm512_cvtss_f32(reg); }
- template
- float reduce_sub_sum(int idx) {
- static_assert(VEC_ELEM_NUM % group_size == 0);
- constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
- __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
- return _mm512_mask_reduce_add_ps(mask, reg);
- }
-
void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); }
void save(float* ptr, const int elem_num) const {
@@ -755,6 +767,25 @@ inline void non_temporal_save(BF16Vec16& vec, void* ptr) {
inline void non_temporal_save(FP32Vec16& vec, void* ptr) {
_mm512_stream_ps((float*)ptr, vec.reg);
}
+
+static void interleave_save(const BF16Vec16& vec0, const BF16Vec16& vec1,
+ void* ptr) {
+ __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg);
+ __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg);
+ vec_1 = _mm512_slli_epi32(vec_1, 16);
+ vec_0 = _mm512_or_si512(vec_0, vec_1);
+ _mm512_storeu_epi32(ptr, vec_0);
+}
+
+static void interleave_save(const FP16Vec16& vec0, const FP16Vec16& vec1,
+ void* ptr) {
+ __m512i vec_0 = _mm512_cvtepu16_epi32(vec0.reg);
+ __m512i vec_1 = _mm512_cvtepu16_epi32(vec1.reg);
+ vec_1 = _mm512_slli_epi32(vec_1, 16);
+ vec_0 = _mm512_or_si512(vec_0, vec_1);
+ _mm512_storeu_epi32(ptr, vec_0);
+}
+
#endif
inline void mem_barrier() { _mm_mfence(); }
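
`interleave_save` above zips two 16-element vectors element by element so that each 32-bit lane of the stored tile holds values from two consecutive K positions, which is the layout `_tile_dpbf16ps` expects for the B operand. A scalar reference of the store pattern:

```cpp
#include <cstdint>

// Scalar reference of the interleave_save() store pattern above: element i of
// vec0 is paired with element i of vec1 in the output.
inline void interleave_save_ref(const uint16_t* vec0, const uint16_t* vec1,
                                uint16_t* out /* 32 elements */) {
  for (int i = 0; i < 16; ++i) {
    out[2 * i] = vec0[i];      // value from K position k
    out[2 * i + 1] = vec1[i];  // value from K position k + 1
  }
}
```
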
diff --git a/csrc/cpu/cpu_wna16.cpp b/csrc/cpu/cpu_wna16.cpp
new file mode 100644
index 000000000000..816d195506e5
--- /dev/null
+++ b/csrc/cpu/cpu_wna16.cpp
@@ -0,0 +1,402 @@
+#include "cpu_types.hpp"
+#include "scratchpad_manager.h"
+#include "utils.hpp"
+
+#ifdef CPU_CAPABILITY_AMXBF16
+ #include "cpu/micro_gemm/cpu_micro_gemm_amx.hpp"
+#endif
+#include "cpu/micro_gemm/cpu_micro_gemm_vec.hpp"
+
+#define VLLM_DISPATCH_CASE_16B_TYPES(...) \
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
+
+#define VLLM_DISPATCH_16B_TYPES(TYPE, NAME, ...) \
+ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_16B_TYPES(__VA_ARGS__))
+
+template <typename T>
+void print_logits(const char* name, T* ptr, int32_t row, int32_t col,
+ int32_t stride) {
+ std::stringstream ss;
+ ss << std::fixed << std::setprecision(5) << name << ": [\n";
+ auto* curr_logits_buffer = ptr;
+ for (int32_t m = 0; m < row; ++m) {
+ for (int32_t n = 0; n < col; ++n) {
+ ss << curr_logits_buffer[n] << ", ";
+ }
+ ss << "\n";
+ curr_logits_buffer += stride;
+ }
+ ss << "]\n";
+ std::printf("%s", ss.str().c_str());
+}
+
+namespace {
+using cpu_utils::ISA;
+using cpu_utils::VecTypeTrait;
+
+template
+class Dequantizer4b {
+ public:
+ constexpr static int32_t pack_num = 32 / 4;
+ using scalar_vec_t = typename VecTypeTrait<scalar_t>::vec_t;
+
+ public:
+ static void dequant(int32_t* __restrict__ q_weight,
+ scalar_t* __restrict__ weight,
+ scalar_t* __restrict__ scales,
+ int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx,
+ const int64_t scales_stride, const int64_t zeros_stride,
+ const int32_t k_size, const int32_t group_size) {
+ vec_op::FP32Vec16 lut;
+ if constexpr (has_zp) {
+ // AWQ
+ alignas(64) static const float LUT[16] = {
+ 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
+ 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f};
+ lut = vec_op::FP32Vec16(LUT);
+ } else {
+ // GPTQ
+ alignas(64) static const float LUT[16] = {
+ -8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f,
+ 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
+ lut = vec_op::FP32Vec16(LUT);
+ }
+
+ // each 64-bit element packs 4-bit values for 16 output channels
+ int64_t* __restrict__ curr_q_weight = reinterpret_cast<int64_t*>(q_weight);
+ int64_t* __restrict__ curr_zeros = reinterpret_cast<int64_t*>(zeros);
+ scalar_t* __restrict__ curr_weight = weight;
+ scalar_t* __restrict__ curr_scale = scales;
+ vec_op::FP32Vec16 scale_0;
+ vec_op::FP32Vec16 scale_1;
+ vec_op::FP32Vec16 zero_0;
+ vec_op::FP32Vec16 zero_1;
+ int32_t group_counter = 0;
+ for (int32_t k_idx = 0; k_idx < k_size; k_idx += 2) {
+ int64_t qwb_0 = *curr_q_weight;
+ int64_t qwb_1 = *(curr_q_weight + 1);
+ vec_op::FP32Vec16 wb_0(qwb_0, lut);
+ vec_op::FP32Vec16 wb_1(qwb_1, lut);
+
+ if constexpr (!use_desc_act) {
+ if (group_counter == 0) {
+ scale_0 = vec_op::FP32Vec16(scalar_vec_t(curr_scale));
+ scale_1 = vec_op::FP32Vec16(scale_0);
+ curr_scale += scales_stride;
+
+ if constexpr (has_zp) {
+ zero_0 = vec_op::FP32Vec16(*curr_zeros, lut);
+ zero_1 = vec_op::FP32Vec16(zero_0);
+ curr_zeros += zeros_stride / 2;
+ }
+ }
+ } else {
+ int32_t g_idx_0 = g_idx[k_idx];
+ int32_t g_idx_1 = g_idx[k_idx + 1];
+ scale_0 = vec_op::FP32Vec16(
+ scalar_vec_t(curr_scale + g_idx_0 * scales_stride));
+ scale_1 = vec_op::FP32Vec16(
+ scalar_vec_t(curr_scale + g_idx_1 * scales_stride));
+ if constexpr (has_zp) {
+ zero_0 = vec_op::FP32Vec16(*(curr_zeros + g_idx_0 * zeros_stride / 2),
+ lut);
+ zero_1 = vec_op::FP32Vec16(*(curr_zeros + g_idx_1 * zeros_stride / 2),
+ lut);
+ }
+ }
+
+ if constexpr (has_zp) {
+ wb_0 = wb_0 - zero_0;
+ wb_1 = wb_1 - zero_1;
+ }
+
+ wb_0 = wb_0 * scale_0;
+ wb_1 = wb_1 * scale_1;
+
+ scalar_vec_t output_vec_0(wb_0);
+ scalar_vec_t output_vec_1(wb_1);
+
+ // AMX needs to interleave K elements to pack them as 32 bits
+ if constexpr (isa == ISA::AMX) {
+ vec_op::interleave_save(output_vec_0, output_vec_1, curr_weight);
+ } else {
+ output_vec_0.save(curr_weight);
+ output_vec_1.save(curr_weight + 16);
+ }
+
+ // update
+ curr_q_weight += 2;
+ curr_weight += 32;
+ if constexpr (!use_desc_act) {
+ group_counter += 2;
+ if (group_counter == group_size) {
+ group_counter = 0;
+ }
+ }
+ }
+ }
+};
+}; // namespace
+
+template
+void cpu_gemm_wna16_impl(
+ scalar_t* __restrict__ input, int32_t* __restrict__ q_weight,
+ scalar_t* __restrict__ output, scalar_t* __restrict__ scales,
+ int32_t* __restrict__ zeros, int32_t* __restrict__ g_idx,
+ scalar_t* __restrict__ bias, const int32_t m_size, const int32_t n_size,
+ const int32_t k_size, const int64_t input_stride,
+ const int64_t output_stride, const int64_t scales_group_stride,
+ const int64_t zeros_group_stride, const int32_t group_num,
+ const int32_t group_size, const int64_t pack_factor) {
+ constexpr int32_t gemm_n_tile_size = gemm_t::NSize;
+ constexpr int32_t gemm_m_tile_size = gemm_t::MaxMSize;
+ constexpr int32_t n_block_size = 16;
+ static_assert(gemm_n_tile_size % n_block_size == 0);
+ const int32_t thread_num = omp_get_max_threads();
+
+ // A simple scheduling policy: keep more B tiles resident in L2 while making
+ // sure every thread gets at least one task
+ const int32_t n_partition_size = [&]() {
+ const int64_t cache_size = cpu_utils::get_l2_size();
+ int64_t ps_cache_limit = cache_size / (k_size * sizeof(scalar_t));
+ int64_t ps_thread_limit = n_size / thread_num;
+ ps_cache_limit =
+ std::max((ps_cache_limit / gemm_n_tile_size) * gemm_n_tile_size,
+ (int64_t)gemm_n_tile_size);
+ ps_thread_limit =
+ std::max((ps_thread_limit / gemm_n_tile_size) * gemm_n_tile_size,
+ (int64_t)gemm_n_tile_size);
+ return std::min(ps_cache_limit, ps_thread_limit);
+ }();
+ const int32_t task_num = (n_size + n_partition_size - 1) / n_partition_size;
+
+ // get buffer size
+ const int64_t b_buffer_size =
+ (((n_partition_size * k_size * sizeof(scalar_t) + 63) / 64) * 64);
+ const int64_t c_buffer_size =
+ (((gemm_m_tile_size * gemm_n_tile_size * sizeof(float) + 63) / 64) * 64);
+ const int64_t b_buffer_offset = 0;
+ const int64_t c_buffer_offset = b_buffer_size;
+ const int64_t buffer_size = b_buffer_size + c_buffer_size;
+ DNNLScratchPadManager::get_dnnl_scratchpad_manager()->realloc(buffer_size *
+ thread_num);
+
+ alignas(64) cpu_utils::Counter counter;
+ cpu_utils::Counter* counter_ptr = &counter;
+
+#pragma omp parallel for schedule(static, 1)
+ for (int32_t thread_id = 0; thread_id < thread_num; ++thread_id) {
+ scalar_t* __restrict__ b_buffer = nullptr;
+ float* __restrict__ c_buffer = nullptr;
+ {
+ uint8_t* buffer_ptr = DNNLScratchPadManager::get_dnnl_scratchpad_manager()
+ ->get_data() +
+ thread_id * buffer_size;
+ b_buffer = reinterpret_cast<scalar_t*>(buffer_ptr + b_buffer_offset);
+ c_buffer = reinterpret_cast<float*>(buffer_ptr + c_buffer_offset);
+ }
+
+ const int64_t q_weight_block_stride = n_block_size / pack_factor * k_size;
+ const int64_t b_buffer_block_stride = n_block_size * k_size;
+ const int32_t zeros_block_stride = n_block_size / pack_factor;
+
+ gemm_t gemm;
+
+ for (;;) {
+ int32_t task_id = counter_ptr->acquire_counter();
+
+ if (task_id >= task_num) {
+ break;
+ }
+
+ const int32_t n_start_idx = task_id * n_partition_size;
+ const int32_t n_block_start_idx = n_start_idx / n_block_size;
+ const int32_t n_num = std::min(n_partition_size, n_size - n_start_idx);
+ const int32_t n_block_num = n_num / n_block_size;
+ // std::printf("thread_id: %d, task_id: %d, n_start_idx: %d, n_num: %d\n",
+ // thread_id, task_id, n_start_idx, n_num);
+
+ // dequant weight
+ {
+ int32_t* __restrict__ curr_q_weight =
+ q_weight + n_block_start_idx * q_weight_block_stride;
+ scalar_t* __restrict__ curr_b_buffer = b_buffer;
+ scalar_t* __restrict__ curr_scales = scales + n_start_idx;
+ int32_t* __restrict__ curr_zeros = zeros + n_start_idx / pack_factor;
+ for (int32_t block_idx = 0; block_idx < n_block_num; ++block_idx) {
+ dequantizer_t::dequant(curr_q_weight, curr_b_buffer, curr_scales,
+ curr_zeros, g_idx, scales_group_stride,
+ zeros_group_stride, k_size, group_size);
+
+ // if (block_idx == 0 && n_start_idx == 0) {
+ // print_logits("depacked weight", curr_b_buffer, k_size,
+ // n_block_size, n_block_size);
+ // }
+
+ // update
+ curr_q_weight += q_weight_block_stride;
+ curr_b_buffer += b_buffer_block_stride;
+ curr_scales += n_block_size;
+ curr_zeros += zeros_block_stride;
+ }
+ }
+
+ // compute loop
+ {
+ const int32_t n_tile_num = n_num / gemm_n_tile_size;
+ scalar_t* __restrict__ curr_input = input;
+ scalar_t* __restrict__ init_bias = bias;
+ if (bias != nullptr) {
+ init_bias += n_start_idx;
+ }
+ scalar_t* __restrict__ init_output = output + n_start_idx;
+ for (int32_t m_idx = 0; m_idx < m_size; m_idx += gemm_m_tile_size) {
+ const int32_t curr_m_size =
+ std::min(gemm_m_tile_size, m_size - m_idx);
+ scalar_t* __restrict__ curr_b_buffer = b_buffer;
+ scalar_t* __restrict__ curr_bias = init_bias;
+ scalar_t* __restrict__ curr_output = init_output;
+ for (int32_t n_tile_idx = 0; n_tile_idx < n_tile_num; ++n_tile_idx) {
+ gemm.gemm(curr_input, curr_b_buffer, c_buffer, curr_m_size, k_size,
+ input_stride, b_buffer_block_stride, gemm_n_tile_size,
+ false);
+
+ if (bias != nullptr) {
+ cpu_micro_gemm::bias_epilogue(
+ c_buffer, curr_output, curr_bias, curr_m_size,
+ gemm_n_tile_size, output_stride);
+ curr_bias += gemm_n_tile_size;
+ } else {
+ cpu_micro_gemm::default_epilogue(
+ c_buffer, curr_output, curr_m_size, gemm_n_tile_size,
+ output_stride);
+ }
+
+ curr_b_buffer +=
+ b_buffer_block_stride * (gemm_n_tile_size / n_block_size);
+ curr_output += gemm_n_tile_size;
+ }
+ curr_input += gemm_m_tile_size * input_stride;
+ init_output += gemm_m_tile_size * output_stride;
+ }
+ }
+ }
+ }
+}
+
+void cpu_gemm_wna16(
+ const torch::Tensor& input, // [M, K]
+ const torch::Tensor&
+ q_weight, // [N / 16, K * 16 / pack_factor], packed as int32
+ torch::Tensor& output, // [M, N]
+ const torch::Tensor& scales, // [group_num, N]
+ const std::optional&
+ zeros, // [group_num, N / pack_factor], packed as int32
+ const std::optional& g_idx, // [K]
+ const std::optional& bias, // [N]
+ const int64_t pack_factor, const std::string& isa_hint) {
+ using cpu_utils::ISA;
+ TORCH_CHECK_EQ(pack_factor, 8); // only 4-bit weights are supported
+ const int32_t a_m_size = input.size(0);
+ const int32_t a_k_size = input.size(1);
+ const int64_t a_m_stride = input.stride(0);
+ const int32_t b_n_size = q_weight.size(0) * 16;
+ TORCH_CHECK_EQ(a_k_size % 32, 0);
+ TORCH_CHECK_EQ(b_n_size % 32, 0);
+ const int32_t group_num = scales.size(0);
+ const int32_t group_size = a_k_size / group_num;
+ TORCH_CHECK_EQ(group_size % 2, 0);
+ const int64_t scales_group_stride = scales.stride(0);
+ const int64_t output_m_stride = output.stride(0);
+
+ bool has_zp = zeros.has_value();
+ bool use_desc_act = g_idx.has_value();
+ TORCH_CHECK(!(has_zp && use_desc_act));
+
+ ISA isa = [&]() {
+ if (isa_hint == "amx") {
+ return ISA::AMX;
+ } else if (isa_hint == "vec") {
+ return ISA::VEC;
+ } else {
+ TORCH_CHECK(false, "unsupported isa hint: " + isa_hint);
+ }
+ }();
+
+ int32_t* zeros_ptr = has_zp ? zeros->data_ptr<int32_t>() : nullptr;
+ const int64_t zeros_group_stride = has_zp ? zeros->stride(0) : 0;
+ int32_t* g_idx_ptr = use_desc_act ? g_idx->data_ptr<int32_t>() : nullptr;
+
+ VLLM_DISPATCH_16B_TYPES(input.scalar_type(), "cpu_gemm_wna16", [&]() {
+ if (isa == ISA::AMX) {
+ using gemm_t = cpu_micro_gemm::MicroGemm;
+ if (has_zp) {
+ using dequantizer_t = Dequantizer4b;
+ cpu_gemm_wna16_impl(
+ input.data_ptr(), q_weight.data_ptr(),
+ output.data_ptr(), scales.data_ptr(), zeros_ptr,
+ g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr,
+ a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+ scales_group_stride, zeros_group_stride, group_num, group_size,
+ pack_factor);
+ return;
+ }
+ if (use_desc_act) {
+ using dequantizer_t = Dequantizer4b;
+ cpu_gemm_wna16_impl(
+ input.data_ptr(), q_weight.data_ptr(),
+ output.data_ptr(), scales.data_ptr(), zeros_ptr,
+ g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr,
+ a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+ scales_group_stride, zeros_group_stride, group_num, group_size,
+ pack_factor);
+ return;
+ } else {
+ using dequantizer_t = Dequantizer4b;
+ cpu_gemm_wna16_impl(
+ input.data_ptr(), q_weight.data_ptr(),
+ output.data_ptr(), scales.data_ptr(), zeros_ptr,
+ g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr,
+ a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+ scales_group_stride, zeros_group_stride, group_num, group_size,
+ pack_factor);
+ return;
+ }
+ } else if (isa == ISA::VEC) {
+ using gemm_t = cpu_micro_gemm::MicroGemm;
+ if (has_zp) {
+ using dequantizer_t = Dequantizer4b;
+ cpu_gemm_wna16_impl(
+ input.data_ptr(), q_weight.data_ptr(),
+ output.data_ptr(), scales.data_ptr(), zeros_ptr,
+ g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr,
+ a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+ scales_group_stride, zeros_group_stride, group_num, group_size,
+ pack_factor);
+ return;
+ }
+ if (use_desc_act) {
+ using dequantizer_t = Dequantizer4b;
+ cpu_gemm_wna16_impl(
+ input.data_ptr(), q_weight.data_ptr(),
+ output.data_ptr(), scales.data_ptr(), zeros_ptr,
+ g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr,
+ a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+ scales_group_stride, zeros_group_stride, group_num, group_size,
+ pack_factor);
+ return;
+ } else {
+ using dequantizer_t = Dequantizer4b;
+ cpu_gemm_wna16_impl(
+ input.data_ptr(), q_weight.data_ptr(),
+ output.data_ptr(), scales.data_ptr(), zeros_ptr,
+ g_idx_ptr, bias.has_value() ? bias->data_ptr() : nullptr,
+ a_m_size, b_n_size, a_k_size, a_m_stride, output_m_stride,
+ scales_group_stride, zeros_group_stride, group_num, group_size,
+ pack_factor);
+ return;
+ }
+ }
+ });
+}
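
As a correctness reference for `Dequantizer4b`, the per-group math for one output channel reduces to the following: GPTQ codes are re-centered by the LUT to the range [-8, 7], while AWQ keeps codes in [0, 15] and subtracts a per-group zero point, before scaling. Names and container types here are illustrative.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar reference of the per-group 4-bit dequantization done by
// Dequantizer4b, for a single output channel. `q` holds codes already
// unpacked to 0..15.
inline std::vector<float> dequant_channel(const std::vector<uint8_t>& q,
                                          const std::vector<float>& scales,
                                          const std::vector<uint8_t>& zeros,
                                          int group_size, bool has_zp) {
  std::vector<float> w(q.size());
  for (std::size_t k = 0; k < q.size(); ++k) {
    const std::size_t g = k / group_size;
    const float centered =
        has_zp ? static_cast<float>(q[k]) - static_cast<float>(zeros[g])  // AWQ
               : static_cast<float>(q[k]) - 8.0f;                         // GPTQ
    w[k] = centered * scales[g];
  }
  return w;
}
```
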
diff --git a/csrc/cpu/dnnl_helper.cpp b/csrc/cpu/dnnl_helper.cpp
index 02a8072ccf30..cfb6e78cba9a 100644
--- a/csrc/cpu/dnnl_helper.cpp
+++ b/csrc/cpu/dnnl_helper.cpp
@@ -396,9 +396,9 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args)
: DNNLMatMulPrimitiveHandler(
static_cast(args), args.ab_type),
m_size_cache_(nullptr) {
- assert(ab_type_ == dnnl::memory::data_type::f32 ||
- ab_type_ == dnnl::memory::data_type::bf16 ||
- ab_type_ == dnnl::memory::data_type::f16);
+ assert(b_type_ == dnnl::memory::data_type::f32 ||
+ b_type_ == dnnl::memory::data_type::bf16 ||
+ b_type_ == dnnl::memory::data_type::f16);
dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_,
{b_k_stride_, b_n_stride_});
diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp
new file mode 100644
index 000000000000..87a019773a89
--- /dev/null
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_amx.hpp
@@ -0,0 +1,245 @@
+#ifndef CPU_MICRO_GEMM_AMX_HPP
+#define CPU_MICRO_GEMM_AMX_HPP
+#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp"
+
+namespace cpu_micro_gemm {
+namespace {
+// AMX specific
+constexpr static int64_t AMX_TILE_ROW_BYTES = 64;
+constexpr static int64_t AMX_TILE_ROW_NUM = 16;
+constexpr static int64_t AMX_TILE_BYTES = AMX_TILE_ROW_BYTES * AMX_TILE_ROW_NUM;
+
+typedef struct __tile_config {
+ uint8_t palette_id = 1;
+ uint8_t start_row = 0;
+ uint8_t reserved_0[14] = {0};
+ uint16_t colsb[16] = {0};
+ uint8_t rows[16] = {0};
+} __tilecfg;
+
+// 2-2-4 pattern, for 16 < m <= 32
+// TILE 0, 1: load A matrix, row num should be 16, m - 16
+// TILE 2, 3: load B matrix, row num should be 16
+// TILE 4, 5, 6, 7: store the C result matrix, row num should be 16, 16,
+// m - 16, m - 16
+template
+class TileGemm224 {
+ public:
+ FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ TORCH_CHECK(false, "Unsupported data type for TileGemm224");
+ }
+
+ FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+ TORCH_CHECK(false, "Unsupported data type for TileGemm224");
+ }
+};
+
+template <>
+class TileGemm224 {
+ public:
+ using scalar_t = c10::BFloat16;
+ FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+ c10::BFloat16* __restrict__ a_tile_0 = a_ptr;
+ c10::BFloat16* __restrict__ a_tile_1 = a_ptr + lda * AMX_TILE_ROW_NUM;
+ const int64_t a_tile_stride = lda * sizeof(c10::BFloat16);
+
+ // B is always packed as 16 output channels block
+ c10::BFloat16* __restrict__ b_tile_2 = b_ptr;
+ c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride;
+ const int32_t b_tile_stride = AMX_TILE_ROW_BYTES;
+
+ float* __restrict__ c_tile_4 = c_ptr;
+ float* __restrict__ c_tile_5 =
+ c_tile_4 + AMX_TILE_ROW_BYTES / sizeof(float);
+ float* __restrict__ c_tile_6 = c_ptr + AMX_TILE_ROW_NUM * ldc;
+ float* __restrict__ c_tile_7 =
+ c_tile_6 + AMX_TILE_ROW_BYTES / sizeof(float);
+ const int32_t c_tile_stride = ldc * sizeof(float);
+
+ if (accum_c) {
+ _tile_loadd(4, c_tile_4, c_tile_stride);
+ _tile_loadd(5, c_tile_5, c_tile_stride);
+ _tile_loadd(6, c_tile_6, c_tile_stride);
+ _tile_loadd(7, c_tile_7, c_tile_stride);
+ } else {
+ _tile_zero(4);
+ _tile_zero(5);
+ _tile_zero(6);
+ _tile_zero(7);
+ }
+
+ for (int32_t k = 0; k < k_times; ++k) {
+ _tile_loadd(0, a_tile_0, a_tile_stride);
+ _tile_stream_loadd(2, b_tile_2, b_tile_stride);
+ _tile_dpbf16ps(4, 0, 2);
+ _tile_stream_loadd(3, b_tile_3, b_tile_stride);
+ _tile_dpbf16ps(5, 0, 3);
+ _tile_loadd(1, a_tile_1, a_tile_stride);
+ _tile_dpbf16ps(6, 1, 2);
+ _tile_dpbf16ps(7, 1, 3);
+
+ // update ptrs
+ a_tile_0 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+ a_tile_1 += AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+ b_tile_2 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ b_tile_3 += AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ }
+
+ _tile_stored(4, c_tile_4, c_tile_stride);
+ _tile_stored(5, c_tile_5, c_tile_stride);
+ _tile_stored(6, c_tile_6, c_tile_stride);
+ _tile_stored(7, c_tile_7, c_tile_stride);
+ }
+
+ FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+ const int32_t m_0 = AMX_TILE_ROW_NUM;
+ const int32_t m_1 = m - AMX_TILE_ROW_NUM;
+ config.rows[0] = m_0;
+ config.rows[1] = m_1;
+ config.rows[2] = AMX_TILE_ROW_NUM;
+ config.rows[3] = AMX_TILE_ROW_NUM;
+ config.rows[4] = m_0;
+ config.rows[5] = m_0;
+ config.rows[6] = m_1;
+ config.rows[7] = m_1;
+ _tile_loadconfig(&config);
+ }
+};
+
+// 1-2-2 pattern, for 0 < m <= 16
+// TILE 0, (1): load A matrix, one extra tile for prefetch, row num should be m, m
+// TILE 2, 3, (4, 5): load B matrix, two extra tiles for prefetch, row num
+// should be 16
+// TILE 6, 7, (6, 7): store the C result matrix, row num should be m
+template
+class TileGemm122 {
+ public:
+ FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ TORCH_CHECK(false, "Unsupported data type for TileGemm122");
+ }
+
+ FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+ TORCH_CHECK(false, "Unsupported data type for TileGemm122");
+ }
+};
+
+template <>
+class TileGemm122 {
+ public:
+ using scalar_t = c10::BFloat16;
+ FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ c10::BFloat16* __restrict__ a_tile_0 = a_ptr;
+ c10::BFloat16* __restrict__ a_tile_1 =
+ a_ptr + AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+ const int64_t a_tile_stride = lda * sizeof(c10::BFloat16);
+
+ c10::BFloat16* __restrict__ b_tile_2 = b_ptr;
+ c10::BFloat16* __restrict__ b_tile_3 = b_ptr + b_n_group_stride;
+ c10::BFloat16* __restrict__ b_tile_4 =
+ b_tile_2 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ c10::BFloat16* __restrict__ b_tile_5 =
+ b_tile_3 + AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ int64_t b_stride = AMX_TILE_ROW_BYTES;
+
+ float* __restrict__ c_tile_6 = c_ptr;
+ float* __restrict__ c_tile_7 = c_ptr + AMX_TILE_ROW_BYTES / sizeof(float);
+ int64_t c_stride = ldc * sizeof(float);
+
+ const int32_t k_times = k / (AMX_TILE_ROW_NUM * 4 / sizeof(c10::BFloat16));
+ const int32_t k_group_times = k_times / 2;
+ const bool has_tail = (k_times % 2 == 1);
+
+ if (accum_c) {
+ _tile_loadd(6, c_tile_6, c_stride);
+ _tile_loadd(7, c_tile_7, c_stride);
+ } else {
+ _tile_zero(6);
+ _tile_zero(7);
+ }
+
+ for (int32_t k = 0; k < k_group_times; ++k) {
+ _tile_loadd(0, a_tile_0, a_tile_stride);
+ _tile_stream_loadd(2, b_tile_2, b_stride);
+ _tile_dpbf16ps(6, 0, 2);
+ _tile_stream_loadd(3, b_tile_3, b_stride);
+ _tile_dpbf16ps(7, 0, 3);
+ _tile_loadd(1, a_tile_1, a_tile_stride);
+ _tile_stream_loadd(4, b_tile_4, b_stride);
+ _tile_dpbf16ps(6, 1, 4);
+ _tile_stream_loadd(5, b_tile_5, b_stride);
+ _tile_dpbf16ps(7, 1, 5);
+
+ // update ptrs
+ a_tile_0 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+ a_tile_1 += 2 * AMX_TILE_ROW_BYTES / sizeof(c10::BFloat16);
+ b_tile_2 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ b_tile_3 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ b_tile_4 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ b_tile_5 += 2 * AMX_TILE_BYTES / sizeof(c10::BFloat16);
+ }
+
+ if (has_tail) {
+ _tile_loadd(0, a_tile_0, a_tile_stride);
+ _tile_stream_loadd(2, b_tile_2, b_stride);
+ _tile_dpbf16ps(6, 0, 2);
+ _tile_stream_loadd(3, b_tile_3, b_stride);
+ _tile_dpbf16ps(7, 0, 3);
+ }
+
+ _tile_stored(6, c_tile_6, c_stride);
+ _tile_stored(7, c_tile_7, c_stride);
+ }
+
+ FORCE_INLINE static void init_tile_config(int32_t m, __tilecfg& config) {
+ config.rows[0] = m;
+ config.rows[1] = m;
+ config.rows[2] = AMX_TILE_ROW_NUM;
+ config.rows[3] = AMX_TILE_ROW_NUM;
+ config.rows[4] = AMX_TILE_ROW_NUM;
+ config.rows[5] = AMX_TILE_ROW_NUM;
+ config.rows[6] = m;
+ config.rows[7] = m;
+ _tile_loadconfig(&config);
+ }
+};
+} // namespace
+
+// Gemm kernel uses AMX, requires B matrix to be packed
+template
+class MicroGemm {
+ public:
+ static constexpr int32_t MaxMSize = 32;
+ static constexpr int32_t NSize = 32;
+
+ public:
+ MicroGemm() : curr_m_(-1) {
+ vec_op::unroll_loop([&](int i) { amx_tile_config_.colsb[i] = 64; });
+ }
+
+ void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ if (m > AMX_TILE_ROW_NUM) {
+ if (m != curr_m_) {
+ curr_m_ = m;
+ TileGemm224::init_tile_config(m, amx_tile_config_);
+ }
+ TileGemm224::gemm(CPU_MICRO_GEMM_PARAMS);
+ } else {
+ if (m != curr_m_) {
+ curr_m_ = m;
+ TileGemm122::init_tile_config(m, amx_tile_config_);
+ }
+ TileGemm122::gemm(CPU_MICRO_GEMM_PARAMS);
+ }
+ }
+
+ private:
+ alignas(64) __tilecfg amx_tile_config_;
+ int32_t curr_m_;
+};
+
+} // namespace cpu_micro_gemm
+
+#endif
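
A naive reference GEMM is handy for validating the AMX tiles above against the mathematical definition; the sketch below works on plain row-major [M, K] x [K, N] operands, so a test harness would first unpack the AMX-packed B layout before comparing results:

```cpp
#include <cstdint>

// Naive reference GEMM: C[m][n] (+)= sum_k A[m][k] * B[k][n] on row-major
// operands with strides lda/ldb/ldc.
template <typename scalar_t>
void reference_gemm(const scalar_t* a, const scalar_t* b, float* c,
                    int32_t m, int32_t n, int32_t k,
                    int64_t lda, int64_t ldb, int64_t ldc, bool accum_c) {
  for (int32_t i = 0; i < m; ++i) {
    for (int32_t j = 0; j < n; ++j) {
      float acc = accum_c ? c[i * ldc + j] : 0.0f;
      for (int32_t kk = 0; kk < k; ++kk) {
        acc += static_cast<float>(a[i * lda + kk]) *
               static_cast<float>(b[kk * ldb + j]);
      }
      c[i * ldc + j] = acc;
    }
  }
}
```
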
diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp
new file mode 100644
index 000000000000..784da55a420e
--- /dev/null
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_impl.hpp
@@ -0,0 +1,91 @@
+#ifndef CPU_MICRO_GEMM_IMPL_HPP
+#define CPU_MICRO_GEMM_IMPL_HPP
+#include "cpu/utils.hpp"
+#include "cpu/cpu_types.hpp"
+
+namespace cpu_micro_gemm {
+#define DEFINE_CPU_MICRO_GEMM_PARAMS \
+ scalar_t *__restrict__ a_ptr, scalar_t *__restrict__ b_ptr, \
+ float *__restrict__ c_ptr, const int32_t m, const int32_t k, \
+ const int64_t lda, const int64_t b_n_group_stride, const int64_t ldc, \
+ const bool accum_c
+
+#define CPU_MICRO_GEMM_PARAMS \
+ a_ptr, b_ptr, c_ptr, m, k, lda, b_n_group_stride, ldc, accum_c
+
+template
+class MicroGemm {
+ public:
+ static constexpr int32_t MaxMSize = 16;
+ static constexpr int32_t NSize = 16;
+
+ public:
+ void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ TORCH_CHECK(false, "Unimplemented MicroGemm.");
+ }
+};
+
+template
+FORCE_INLINE void default_epilogue(float* __restrict__ c_ptr,
+ scalar_t* __restrict__ d_ptr,
+ const int32_t m, const int64_t ldc,
+ const int64_t ldd) {
+ using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t;
+ static_assert(n_size % 16 == 0);
+
+ float* __restrict__ curr_c = c_ptr;
+ scalar_t* __restrict__ curr_d = d_ptr;
+ for (int32_t i = 0; i < m; ++i) {
+ float* __restrict__ curr_c_iter = curr_c;
+ scalar_t* __restrict__ curr_d_iter = curr_d;
+ vec_op::unroll_loop([&](int32_t n_g_idx) {
+ vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
+ scalar_vec_t c_vec(c_vec_fp32);
+ c_vec.save(curr_d_iter);
+ curr_c_iter += 16;
+ curr_d_iter += 16;
+ });
+ curr_c += ldc;
+ curr_d += ldd;
+ }
+}
+
+template
+FORCE_INLINE void bias_epilogue(float* __restrict__ c_ptr,
+ scalar_t* __restrict__ d_ptr,
+ scalar_t* __restrict__ bias_ptr,
+ const int32_t m, const int64_t ldc,
+ const int64_t ldd) {
+ using scalar_vec_t = typename cpu_utils::VecTypeTrait::vec_t;
+ static_assert(n_size % 16 == 0);
+ constexpr int32_t n_group_num = n_size / 16;
+ static_assert(n_group_num <= 16);
+
+ vec_op::FP32Vec16 bias_vecs[n_group_num];
+ scalar_t* __restrict__ curr_bias = bias_ptr;
+ vec_op::unroll_loop([&](int32_t i) {
+ scalar_vec_t vec(curr_bias);
+ bias_vecs[i] = vec_op::FP32Vec16(vec);
+ curr_bias += 16;
+ });
+
+ float* __restrict__ curr_c = c_ptr;
+ scalar_t* __restrict__ curr_d = d_ptr;
+ for (int32_t i = 0; i < m; ++i) {
+ float* __restrict__ curr_c_iter = curr_c;
+ scalar_t* __restrict__ curr_d_iter = curr_d;
+ vec_op::unroll_loop([&](int32_t n_g_idx) {
+ vec_op::FP32Vec16 c_vec_fp32(curr_c_iter);
+ c_vec_fp32 = c_vec_fp32 + bias_vecs[n_g_idx];
+ scalar_vec_t c_vec(c_vec_fp32);
+ c_vec.save(curr_d_iter);
+ curr_c_iter += 16;
+ curr_d_iter += 16;
+ });
+ curr_c += ldc;
+ curr_d += ldd;
+ }
+}
+} // namespace cpu_micro_gemm
+
+#endif
diff --git a/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp
new file mode 100644
index 000000000000..3985c2f2e5fe
--- /dev/null
+++ b/csrc/cpu/micro_gemm/cpu_micro_gemm_vec.hpp
@@ -0,0 +1,115 @@
+#ifndef CPU_MICRO_GEMM_VEC_HPP
+#define CPU_MICRO_GEMM_VEC_HPP
+#include "cpu/micro_gemm/cpu_micro_gemm_impl.hpp"
+
+namespace cpu_micro_gemm {
+namespace {
+// 8-2-16 pattern, 8 regs for A, 2 regs for B, 16 regs for C, [8, K] @ [K, 32]
+template
+class TileGemm82 {
+ public:
+ FORCE_INLINE static void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ switch (m) {
+ case 1:
+ gemm_micro<1>(CPU_MICRO_GEMM_PARAMS);
+ break;
+ case 2:
+ gemm_micro<2>(CPU_MICRO_GEMM_PARAMS);
+ break;
+ case 3:
+ gemm_micro<3>(CPU_MICRO_GEMM_PARAMS);
+ break;
+ case 4:
+ gemm_micro<4>(CPU_MICRO_GEMM_PARAMS);
+ break;
+ case 5:
+ gemm_micro<5>(CPU_MICRO_GEMM_PARAMS);
+ break;
+ case 6:
+ gemm_micro<6>(CPU_MICRO_GEMM_PARAMS);
+ break;
+ case 7:
+ gemm_micro<7>(CPU_MICRO_GEMM_PARAMS);
+ break;
+ case 8:
+ gemm_micro<8>(CPU_MICRO_GEMM_PARAMS);
+ break;
+ }
+ }
+
+ template
+ static void gemm_micro(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ static_assert(0 < M && M <= 8);
+ using load_vec_t = typename cpu_utils::VecTypeTrait::vec_t;
+
+ scalar_t* __restrict__ curr_b_0 = b_ptr;
+ scalar_t* __restrict__ curr_b_1 = b_ptr + b_n_group_stride;
+ float* __restrict__ curr_c_0 = c_ptr;
+ float* __restrict__ curr_c_1 = c_ptr + 16;
+
+ vec_op::FP32Vec16 c_regs[M * 2];
+ if (accum_c) {
+ float* __restrict__ curr_m_c_0 = curr_c_0;
+ float* __restrict__ curr_m_c_1 = curr_c_1;
+ vec_op::unroll_loop([&](int32_t i) {
+ c_regs[i * 2] = vec_op::FP32Vec16(curr_m_c_0);
+ c_regs[i * 2 + 1] = vec_op::FP32Vec16(curr_m_c_1);
+
+ // update
+ curr_m_c_0 += ldc;
+ curr_m_c_1 += ldc;
+ });
+ }
+
+ scalar_t* __restrict__ curr_a = a_ptr;
+ for (int32_t k_idx = 0; k_idx < k; ++k_idx) {
+ load_vec_t b_0_reg(curr_b_0);
+ vec_op::FP32Vec16 fp32_b_0_reg(b_0_reg);
+ load_vec_t b_1_reg(curr_b_1);
+ vec_op::FP32Vec16 fp32_b_1_reg(b_1_reg);
+
+ scalar_t* __restrict__ curr_m_a = curr_a;
+ vec_op::unroll_loop([&](int32_t i) {
+ scalar_t v = *curr_m_a;
+ load_vec_t a_reg_original(v);
+ vec_op::FP32Vec16 a_reg(a_reg_original);
+ c_regs[i * 2] = c_regs[i * 2] + a_reg * fp32_b_0_reg;
+ c_regs[i * 2 + 1] = c_regs[i * 2 + 1] + a_reg * fp32_b_1_reg;
+
+ // update
+ curr_m_a += lda;
+ });
+
+ // update
+ curr_a += 1;
+ curr_b_0 += 16;
+ curr_b_1 += 16;
+ }
+
+ vec_op::unroll_loop([&](int32_t i) {
+ c_regs[i * 2].save(curr_c_0);
+ c_regs[i * 2 + 1].save(curr_c_1);
+
+ // update
+ curr_c_0 += ldc;
+ curr_c_1 += ldc;
+ });
+ }
+};
+} // namespace
+
+// Gemm kernel uses vector instructions, requires B matrix to be packed
+template
+class MicroGemm {
+ public:
+ static constexpr int32_t MaxMSize = 8;
+ static constexpr int32_t NSize = 32;
+
+ public:
+ void gemm(DEFINE_CPU_MICRO_GEMM_PARAMS) {
+ TileGemm82::gemm(CPU_MICRO_GEMM_PARAMS);
+ }
+};
+} // namespace cpu_micro_gemm
+
+#endif
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
index 5e2aa7069256..e0e3ef71b485 100644
--- a/csrc/cpu/torch_bindings.cpp
+++ b/csrc/cpu/torch_bindings.cpp
@@ -100,6 +100,16 @@ void cpu_attention_with_kv_cache(
const torch::Tensor& scheduler_metadata,
const std::optional& s_aux);
+// Note: placeholder to avoid import errors
+void placeholder_op() { TORCH_CHECK(false, "Unimplemented"); }
+
+void cpu_gemm_wna16(const torch::Tensor& input, const torch::Tensor& q_weight,
+ torch::Tensor& output, const torch::Tensor& scales,
+ const std::optional& zeros,
+ const std::optional